diff --git a/.gitattributes b/.gitattributes index 5a815654b4c..bede44edf8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,4 +15,6 @@ windows/INSTALL* eol=native windows/NewGuidCmd.exe.config text eol=crlf windows/NewGuidCmd.exe binary +# Prevent git changing CR-LF to LF when archiving (patch requires CR-LF on Windows). +**/*.patch -text diff --git a/.gitignore b/.gitignore index 961c8d64dc0..97a9cfff255 100644 --- a/.gitignore +++ b/.gitignore @@ -6,11 +6,12 @@ !/src/*/Makefile !/src/*/README -# Compiled Object files +# Compiled Object files and python ciles *.slo *.lo *.o *.obj +*.pyc # Compiled Dynamic libraries *.so @@ -81,6 +82,8 @@ GSYMS /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 /tools/sctk-2.4.0/ +/tools/sctk-2.4.10-20151007-1312Z.tar.bz2 +/tools/sctk-2.4.10/ /tools/sph2pipe_v2.5.tar.gz /tools/sph2pipe_v2.5/ /tools/kaldi_lm.tar.gz @@ -98,5 +101,9 @@ GSYMS /tools/mpg123 /tools/mpg123-1.21.0.tar.bz2 /tools/mpg123-1.21.0 +/tools/pthreads +/tools/pthreads*.zip /tools/sequitur /tools/srilm.tgz + +/kaldiwin_vs* diff --git a/egs/ami/s5/RESULTS_ihm b/egs/ami/s5/RESULTS_ihm index 234a434afb4..6435e9df47b 100644 --- a/egs/ami/s5/RESULTS_ihm +++ b/egs/ami/s5/RESULTS_ihm @@ -6,6 +6,9 @@ for x in exp/ihm/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x = exit 0 +# Results with close-talk microphones (IHM), + +# Pawel, dev exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.dtl:Percent Total Error = 38.0% (35925) exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.dtl:Percent Total Error = 35.3% (33329) @@ -18,14 +21,58 @@ exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.dtl:Perce exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Percent Total Error = 35.0% (31463) exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 31.7% (28518) +# Karel, JSALT 2015, (21.7.2015) + +# dev, +## GMM, +%WER 38.1 | 13098 94489 | 67.1 20.6 12.2 5.2 38.1 67.0 | exp/ihm/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys +%WER 35.5 | 13098 94487 | 69.6 19.0 11.4 5.1 35.5 65.8 | exp/ihm/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev.ctm.filt.sys +%WER 32.2 | 13098 94483 | 72.5 17.2 10.3 4.8 32.2 63.8 | exp/ihm/tri4a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_14/dev.ctm.filt.sys #0.1% worse than Pawel! +%WER 30.2 | 13098 94479 | 74.0 15.6 10.4 4.2 30.2 61.9 | exp/ihm/tri4a_mmi_b0.1/decode_dev_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/dev.ctm.filt.sys +## DNN-Xent, +%WER 26.0 | 13098 94483 | 77.9 13.5 8.5 4.0 26.0 58.4 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_11/dev.ctm.filt.sys +## DNN-sMBR, +%WER 24.9 | 13098 94484 | 79.2 13.2 7.6 4.1 24.9 57.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev.ctm.filt.sys +%WER 24.3 | 13098 94481 | 79.6 12.6 7.8 3.9 24.3 56.3 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_12/dev.ctm.filt.sys + +# eval, +## GMM, +%WER 43.9 | 12643 89978 | 60.8 25.3 13.9 4.8 43.9 65.6 | exp/ihm/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys +%WER 40.8 | 12643 89985 | 63.8 23.6 12.6 4.7 40.8 64.6 | exp/ihm/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys +%WER 35.1 | 12643 89975 | 69.1 19.8 11.1 4.2 35.1 61.8 | exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_14/eval.ctm.filt.sys #0.1% worse than Pawel! 
+%WER 31.7 | 12643 89986 | 72.1 18.0 9.9 3.8 31.7 59.4 | exp/ihm/tri4a_mmi_b0.1/decode_eval_4.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/eval.ctm.filt.sys +## DNN-Xent, +%WER 27.1 | 12643 89971 | 76.4 15.5 8.1 3.5 27.1 57.2 | exp/ihm/dnn4_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_10/eval.ctm.filt.sys +## DNN-sMBR, +%WER 25.4 | 12643 89974 | 77.9 14.7 7.4 3.3 25.4 55.1 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval.ctm.filt.sys +%WER 24.6 | 12643 89972 | 78.8 14.1 7.1 3.3 24.6 54.4 | exp/ihm/dnn4_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_11/eval.ctm.filt.sys + -# TDNN results +# Vijay, TDNN results, for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done -#dev +# dev, %WER 25.0 | 13098 94483 | 78.3 12.0 9.6 3.4 25.0 57.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev/ascore_13/dev_hires.ctm.filt.sys %WER 25.3 | 13098 94468 | 78.5 12.7 8.8 3.8 25.3 57.9 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt/ascore_12/dev_hires.ctm.filt.sys %WER 25.0 | 13098 94476 | 78.5 12.4 9.1 3.6 25.0 58.0 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt_offline/ascore_13/dev_hires.ctm.filt.sys -#eval +# eval, %WER 25.9 | 12643 89971 | 77.2 14.2 8.6 3.2 25.9 56.4 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval/ascore_12/eval_hires.ctm.filt.sys %WER 26.0 | 12643 89976 | 77.1 14.7 8.2 3.2 26.0 55.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt/ascore_12/eval_hires.ctm.filt.sys %WER 25.8 | 12643 89978 | 77.6 14.6 7.8 3.4 25.8 55.8 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt_offline/ascore_11/eval_hires.ctm.filt.sys + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems + +# BLSTM +# local/nnet3/run_blstm.sh --mic ihm \ +# --chunk-right-context 20 \ +# --use-sat-alignments true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. 
+ +%WER 22.8 | 13098 94494 | 80.1 11.0 8.9 3.0 22.8 54.8 | exp/ihm/nnet3/lstm_bidirectional_ld0/decode_dev/ascore_10/dev_hires.ctm.filt.sys +%WER 22.6 | 12643 89969 | 80.0 12.7 7.3 2.7 22.6 53.5 | exp/ihm/nnet3/lstm_bidirectional_ld0/decode_eval/ascore_9/eval_hires.ctm.filt.sys + +## Chain systems + # local/chain/run_tdnn_ami_5.sh --mic ihm --max-wer 50 --affix min_seg_len2_50wer (built with min-seg-len 2 secs, but script now just supports (frames_per_eg+5)/100) + %WER 22.4 | 13098 94484 | 80.5 10.7 8.8 3.0 22.4 54.8 | 0.091 | exp/ihm/chain/tdnn_min_seg_len2_50wer_sp/decode_dev/ascore_10/dev_hires.ctm.filt.sys + %WER 22.4 | 12643 89973 | 80.3 12.6 7.1 2.8 22.4 53.2 | 0.155 | exp/ihm/chain/tdnn_min_seg_len2_50wer_sp/decode_eval/ascore_10/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5/RESULTS_mdm b/egs/ami/s5/RESULTS_mdm index d0cbb335bd8..757f6a4d227 100644 --- a/egs/ami/s5/RESULTS_mdm +++ b/egs/ami/s5/RESULTS_mdm @@ -6,8 +6,9 @@ for x in exp/mdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x exit 0 -#Beamforming of 8 microphones, WER scores with up to 4 overlapping speakers +# Beamforming of 8 microphones, WER scores with up to 4 overlapping speakers, +# Pawel, dev exp/mdm8/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 58.8% (55568) exp/mdm8/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 57.0% (53855) @@ -35,7 +36,7 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #dev %WER 40.9 | 15965 94490 | 64.6 19.9 15.5 5.5 40.9 61.9 | -26.104 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 40.7 | 13961 94495 | 64.4 18.8 16.8 5.0 40.7 70.4 | -26.622 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - #eval + #eval %WER 44.2 | 13577 89767 | 61.1 22.3 16.6 5.3 44.2 68.9 | -25.003 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_11/eval_hires_o4.ctm.filt.sys %WER 44.0 | 13448 89769 | 60.8 21.4 17.8 4.9 44.0 69.6 | -25.331 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys @@ -57,8 +58,8 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr %WER 40.9 | 15965 94490 | 64.6 19.9 15.5 5.5 40.9 61.9 | -26.104 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 40.7 | 13961 94495 | 64.4 18.8 16.8 5.0 40.7 70.4 | -26.622 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - #eval - # epoch 0 + #eval + # epoch 0 %WER 45.4 | 13992 89799 | 60.1 26.4 13.5 5.5 45.4 67.3 | -23.969 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch0_eval_utt/ascore_12/eval_hires_o4.ctm.filt.sys %WER 45.1 | 13893 89804 | 60.3 25.9 13.9 5.4 45.1 67.9 | -24.110 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch0_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys # epoch 1 @@ -74,3 +75,56 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr %WER 44.2 | 13577 89767 | 61.1 22.3 16.6 5.3 44.2 68.9 | -25.003 | exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_11/eval_hires_o4.ctm.filt.sys %WER 44.0 | 13448 89769 | 60.8 21.4 17.8 4.9 44.0 69.6 | -25.331 | 
exp/mdm8/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems + +# BLSTM + clean alignments +# local/nnet3/run_blstm.sh --mic mdm8\ +# --chunk-right-context 20 \ +# --use-sat-alignments true \ +# --use-ihm-ali true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. + + + %WER 35.5 | 15221 94509 | 69.9 21.0 9.1 5.4 35.5 61.4 | -26.440 | exp/mdm8_cleanali/nnet3/lstm_bidirectional_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys + %WER 38.3 | 13423 89786 | 65.8 22.0 12.2 4.1 38.3 66.3 | -26.016 | exp/mdm8_cleanali/nnet3/lstm_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + + +################################# Chain Systems ###################### + + # local/chain/run_tdnn_ami_5.sh --mic mdm8 --affix msl1.5_45wer + %WER 38.5 | 14761 94496 | 65.5 17.6 16.9 4.0 38.5 66.5 | 0.620 | exp/mdm8/chain/tdnn_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 41.5 | 14219 89974 | 62.2 18.5 19.2 3.7 41.5 65.6 | 0.596 | exp/mdm8/chain/tdnn_ami5_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + + # local/chain/run_tdnn_ami_5.sh --mic mdm8 --use-ihm-ali true --affix msl1.5_45wer + %WER 38.1 | 15296 94487 | 65.7 17.9 16.4 3.8 38.1 62.5 | 0.617 | exp/mdm8_cleanali/chain/tdnn_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 41.5 | 13795 89975 | 62.6 20.4 17.0 4.1 41.5 66.9 | 0.628 | exp/mdm8_cleanali/chain/tdnn_ami5_msl1.5_45wer_sp/decode_eval/ascore_8/eval_hires_o4.ctm.filt.sys + +#-------------------------------------------------------------------------------------------------------------------------------------------- +# Karel, JSALT 2015, (31.7.2015) +# nnet1, MFCC-LDA-MLLT-DNN system (local/nnet/run_dnn_lda_mllt.sh), + +# dev, +## GMM, +%WER 59.1 | 14105 94500 | 47.5 34.3 18.2 6.6 59.1 76.0 | -22.348 | exp/mdm8/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +%WER 57.2 | 14807 94503 | 49.6 33.2 17.3 6.8 57.2 72.1 | -22.450 | exp/mdm8/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +%WER 55.0 | 14511 94490 | 51.1 30.0 18.8 6.2 55.0 73.0 | -22.760 | exp/mdm8/tri3a_mmi_b0.1/decode_dev_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_11/dev_o4.ctm.filt.sys +## DNN-Xent, +%WER 48.2 | 15246 94513 | 58.4 28.7 12.9 6.7 48.2 67.3 | -23.329 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +## DNN-sMBR, +%WER 46.2 | 15260 94500 | 60.1 26.0 13.9 6.3 46.2 66.3 | -23.908 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_13/dev_o4.ctm.filt.sys +%WER 45.1 | 14204 94504 | 61.1 24.9 14.0 6.2 45.1 70.7 | -24.225 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/dev_o4.ctm.filt.sys + +# eval, +## GMM, +%WER 64.4 | 14362 90002 | 41.7 36.6 21.8 6.0 64.4 71.2 | -22.256 | exp/mdm8/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 62.1 | 13700 89987 | 44.0 35.5 20.5 6.2 62.1 74.1 | -22.267 | exp/mdm8/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 59.5 | 13822 89978 | 46.3 32.8 20.9 5.7 59.5 72.6 | -22.394 | exp/mdm8/tri3a_mmi_b0.1/decode_eval_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.sys +## DNN-Xent, +%WER 52.1 | 13642 89829 | 53.6 30.7 15.7 5.7 52.1 71.6 | -22.884 | 
exp/mdm8/dnn4noSAT_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +## DNN-sMBR, +%WER 50.3 | 14264 89966 | 54.7 27.6 17.8 5.0 50.3 67.5 | -23.397 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_13/eval_o4.ctm.filt.sys +%WER 49.1 | 13969 89982 | 55.8 26.7 17.4 4.9 49.1 68.4 | -23.629 | exp/mdm8/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/eval_o4.ctm.filt.sys + diff --git a/egs/ami/s5/RESULTS_sdm b/egs/ami/s5/RESULTS_sdm index d2bcad1f414..362d4019327 100644 --- a/egs/ami/s5/RESULTS_sdm +++ b/egs/ami/s5/RESULTS_sdm @@ -5,8 +5,9 @@ for x in exp/sdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x for x in exp/sdm*/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done 2>/dev/null exit 0 -#the below are WER scores with up to 4 overlapping speakers +# The below are WER scores with up to 4 overlapping speakers, +# Pawel, dev exp/sdm1/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 66.9% (63190) exp/sdm1/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.dtl:Percent Total Error = 64.5% (60963) @@ -17,8 +18,6 @@ exp/sdm1/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:P exp/sdm1/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 69.5% (62576) exp/sdm1/tri3a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.dtl:Percent Total Error = 67.2% (60447) - - #-------------------------------------------------------------------------------------------------------------------------------------------- #TDNN-online system mic=sdm1 @@ -27,7 +26,7 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #Dev %WER 46.8 | 15053 94502 | 59.3 27.6 13.0 6.2 46.8 67.0 | -23.602 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_dev_utt/ascore_12/dev_hires_o4.ctm.filt.sys %WER 46.4 | 14210 94496 | 59.0 26.6 14.4 5.4 46.4 70.7 | -23.844 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_dev_utt_offline/ascore_13/dev_hires_o4.ctm.filt.sys - + #Eval %WER 50.7 | 13180 89643 | 54.7 29.6 15.7 5.3 50.7 72.6 | -23.104 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_eval_utt/ascore_12/eval_hires_o4.ctm.filt.sys %WER 50.5 | 13099 89806 | 54.7 29.3 15.9 5.2 50.5 73.5 | -23.149 | exp/sdm1/nnet2_online/nnet_ms_sp_online/decode_eval_utt_offline/ascore_12/eval_hires_o4.ctm.filt.sys @@ -66,3 +65,83 @@ for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && gr #epoch 4 %WER 49.1 | 13948 89977 | 55.6 25.2 19.2 4.8 49.1 68.2 | -23.902 | exp/sdm1/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt/ascore_15/eval_hires_o4.ctm.filt.sys %WER 49.0 | 14259 89798 | 55.8 25.4 18.8 4.8 49.0 66.6 | -23.873 | exp/sdm1/nnet2_online/nnet_ms_sp_smbr_0.000005/decode_epoch4_eval_utt_offline/ascore_15/eval_hires_o4.ctm.filt.sys + + +#------------------------------------------------------------------------------------------------------------------------------------ +# Nnet3 systems +# the ivectors are estimated per recording (not per utterance), the results will be updated with per-utterance ivectors + +#TDNN +#Total training time is 5:19:19 +# local/nnet3/run_tdnn.sh --mic sdm1 --use-sat-alignments false +%WER 46.1 | 15377 94333 | 59.1 25.8 15.0 5.2 46.1 65.6 | -24.026 | exp/sdm1/nnet3/tdnn_sp/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 50.9 | 13867 89975 | 
53.6 27.9 18.5 4.5 50.9 70.1 | -23.332 | exp/sdm1/nnet3/tdnn_sp/decode_eval/ascore_11/eval_hires_o4.ctm.filt.sys + +#LSTM +#Total training time is 21:34:06 +%WER 44.2 | 14069 94507 | 61.3 25.8 12.9 5.5 44.2 70.7 | -24.180 | exp/sdm1/nnet3/lstm_sp_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 47.6 | 14034 89978 | 56.8 26.9 16.4 4.3 47.6 67.7 | -23.786 | exp/sdm1/nnet3/lstm_sp_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +#Variable-delay LSTM (Default LSTM recipe) +#Total training time is 18:43:35 +# local/nnet3/run_lstm.sh --mic sdm1 --use-sat-alignments false +%WER 44.4 | 14208 94318 | 61.2 25.7 13.1 5.5 44.4 70.1 | -24.197 | exp/sdm1/nnet3/lstm_sp_ld5/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 47.9 | 14766 89956 | 56.0 25.7 18.2 3.9 47.9 64.1 | -23.997 | exp/sdm1/nnet3/lstm_sp_ld5/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +# BLSTM +# local/nnet3/run_blstm.sh --mic sdm1 --use-sat-alignments false +%WER 42.8 | 14948 94501 | 62.2 25.2 12.6 5.1 42.8 65.8 | -24.499 | exp/sdm1/nnet3/lstm_sp_bidirectional_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 46.1 | 13760 89981 | 57.8 25.9 16.3 3.9 46.1 68.2 | -24.143 | exp/sdm1/nnet3/lstm_sp_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +# local/nnet3/run_blstm.sh --mic sdm1 --use-sat-alignments true +%WER 42.5 | 14150 94510 | 62.4 24.6 12.9 4.9 42.5 69.2 | -24.676 | exp/sdm1/nnet3/lstm_sp_bidirectional_fmllr_ld0/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 45.6 | 14142 89993 | 58.5 26.1 15.4 4.2 45.6 66.5 | -24.127 | exp/sdm1/nnet3/lstm_sp_bidirectional_fmllr_ld0/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +# BLSTM + clean alignments +# local/nnet3/run_blstm.sh --mic sdm1 \ +# --chunk-right-context 20 \ +# --use-sat-alignments true \ +# --use-ihm-ali true +# Note: Chunk right context of 20 limits the latency of the acoustic model to +# 20 frames. 
+ +%WER 38.5 | 14828 94514 | 66.6 22.7 10.6 5.2 38.5 63.7 | -25.569 | exp/sdm1_cleanali/nnet3/lstm_sp_bidirectional_ld0/decode_dev/ascore_11/dev_hires_o4.ctm.filt.sys +%WER 41.8 | 12828 89977 | 62.5 24.6 12.9 4.3 41.8 70.8 | -24.813 | exp/sdm1_cleanali/nnet3/lstm_sp_bidirectional_ld0/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys + +################################## +# chain model results + # local/chain/run_tdnn_ami_5.sh --mic sdm1 --affix msl1.5_45wer + %WER 42.8 | 14391 94487 | 60.8 19.3 19.9 3.6 42.8 69.1 | 0.588 | exp/sdm1/chain/tdnn_ami4_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 46.1 | 13754 89977 | 57.5 20.7 21.9 3.6 46.1 69.2 | 0.561 | exp/sdm1/chain/tdnn_ami4_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + + + # local/chain/run_tdnn_ami_5.sh --mic sdm1 --use-ihm-ali true --max-wer 50 --affix msl1.5_50wer + %WER 41.6 | 14793 94504 | 61.8 19.3 18.9 3.4 41.6 65.3 | 0.591 | exp/sdm1_cleanali/chain/tdnn_ami4_msl1.5_50wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys + %WER 45.4 | 14141 89972 | 57.9 20.7 21.4 3.3 45.4 64.8 | 0.567 | exp/sdm1_cleanali/chain/tdnn_ami4_msl1.5_50wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +#-------------------------------------------------------------------------------------------------------------------------------------------- +# Karel, JSALT 2015, (28.7.2015) +# nnet1, MFCC-LDA-MLLT-DNN system (local/nnet/run_dnn_lda_mllt.sh), + +# dev, +## GMM +%WER 66.8 | 14238 94527 | 40.1 40.4 19.5 6.8 66.8 76.1 | -22.367 | exp/sdm1/tri2a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_12/dev_o4.ctm.filt.sys +%WER 64.4 | 14843 94511 | 42.1 38.7 19.2 6.5 64.4 72.2 | -22.275 | exp/sdm1/tri3a/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_12/dev_o4.ctm.filt.sys +%WER 62.3 | 14761 94499 | 44.0 35.7 20.3 6.3 62.3 72.4 | -22.262 | exp/sdm1/tri3a_mmi_b0.1/decode_dev_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/dev_o4.ctm.filt.sys +## DNN-Xent, +%WER 54.0 | 14017 94513 | 51.7 32.3 15.9 5.7 54.0 73.8 | -22.649 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn/decode_dev_ami_fsh.o3g.kn.pr1-7/ascore_13/dev_o4.ctm.filt.sys +## DNN-sMBR, +%WER 51.6 | 15097 94506 | 54.5 29.8 15.7 6.1 51.6 67.5 | -22.989 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/dev_o4.ctm.filt.sys +%WER 50.6 | 14806 94481 | 55.4 29.6 15.0 6.0 50.6 68.7 | -23.087 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_dev_ami_fsh.o3g.kn.pr1-7_it4/ascore_14/dev_o4.ctm.filt.sys + +# eval, +## GMM, +%WER 71.8 | 13901 89999 | 33.9 41.6 24.5 5.7 71.8 74.4 | -22.720 | exp/sdm1/tri2a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 69.5 | 13480 89988 | 36.0 39.6 24.4 5.5 69.5 76.3 | -22.469 | exp/sdm1/tri3a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +%WER 67.2 | 13704 89979 | 38.1 36.6 25.3 5.3 67.2 73.7 | -22.292 | exp/sdm1/tri3a_mmi_b0.1/decode_eval_2.mdl_ami_fsh.o3g.kn.pr1-7/ascore_10/eval_o4.ctm.filt.sys +## DNN-Xent, +%WER 58.6 | 14191 89646 | 46.7 34.8 18.6 5.3 58.6 69.2 | -22.351 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.sys +## DNN-sMBR, +%WER 56.4 | 14203 89973 | 48.8 31.7 19.5 5.2 56.4 68.8 | -22.584 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it1/ascore_11/eval_o4.ctm.filt.sys +%WER 55.0 | 13731 89834 | 50.7 32.6 16.6 5.8 55.0 70.7 | -22.580 | exp/sdm1/dnn4noSAT_pretrain-dbn_dnn_smbr/decode_eval_ami_fsh.o3g.kn.pr1-7_it4/ascore_13/eval_o4.ctm.filt.sys + diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index 
e9899d582f6..5ec5d4b715f 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -1,9 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1 +# scripts. +export cuda_cmd="queue.pl --gpu 1 --mem 20G" + +# the rest of this file is present for historical reasons. +# In general it's best to rely on conf/queue.conf for cluster-specific +# configuration. # On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,29 +26,13 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -#export train_cmd="queue.pl -l arch=*64*" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -#export cuda_cmd="run.pl" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/conf/queue_jsalt.conf b/egs/ami/s5/conf/queue_jsalt.conf new file mode 100644 index 00000000000..6cda84f912a --- /dev/null +++ b/egs/ami/s5/conf/queue_jsalt.conf @@ -0,0 +1,11 @@ +# Origin at : http://wiki.clsp.jhu.edu/view/Ws15_AWS_Kluster_Rules +# configuration for the AWS cluster for WS'15. 
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 +option gpu=1 -q gpu.q diff --git a/egs/ami/s5/local/ami_beamform.sh b/egs/ami/s5/local/ami_beamform.sh index 419e67c74d2..b5ff8c23ba8 100755 --- a/egs/ami/s5/local/ami_beamform.sh +++ b/egs/ami/s5/local/ami_beamform.sh @@ -34,6 +34,8 @@ set -u mkdir -p $odir mkdir -p $wdir/log +[ -e $odir/.done_beamforming ] && echo "Beamforming already done, skipping..." && exit 0 + meetings=$wdir/meetings.list cat local/split_train.orig local/split_dev.orig local/split_eval.orig | sort > $meetings @@ -74,3 +76,4 @@ echo -e "Beamforming\n" $cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ local/beamformit.sh $nj JOB $numch $meetings $sdir $odir +touch $odir/.done_beamforming diff --git a/egs/ami/s5/local/ami_download.sh b/egs/ami/s5/local/ami_download.sh index 3a2a0c5c0fe..b14f8550c75 100755 --- a/egs/ami/s5/local/ami_download.sh +++ b/egs/ami/s5/local/ami_download.sh @@ -53,8 +53,8 @@ cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/am wgetfile=$wdir/wget_$mic.sh # TODO fix this with Pawel, files don't exist anymore, -manifest="wget -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" -license="wget -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" +manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" +license="wget --continue -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" echo "#!/bin/bash" > $wgetfile echo $manifest >> $wgetfile diff --git a/egs/ami/s5/local/ami_format_data.sh b/egs/ami/s5/local/ami_format_data.sh index fda2d498137..91cd619f574 100755 --- a/egs/ami/s5/local/ami_format_data.sh +++ b/egs/ami/s5/local/ami_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -15,25 +15,12 @@ arpa_lm=$1 cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
@@ -61,4 +48,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic echo AMI_format_data succeeded. - diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index a0cca9c5f8e..c3b9914d7a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -84,7 +84,7 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; #check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). #There is actually only one such case for devset and automatic segmentetions -join $dir/utt2spkm $dir/segments | \ +join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 406add86bca..ab0fd185f70 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus dev/eval data preparation +# AMI Corpus dev/eval data preparation . path.sh #check existing directories if [ $# != 3 ]; then echo "Usage: ami_mdm_scoring_data_prep.sh /path/to/AMI-MDM mic-name set-name" - exit 1; -fi + exit 1; +fi AMI_DIR=$1 mic=$2 @@ -24,8 +24,8 @@ mkdir -p $tmpdir # Audio data directory check if [ ! -d $AMI_DIR ]; then echo "Error: run.sh requires a directory argument" - exit 1; -fi + exit 1; +fi # And transcripts check if [ ! -f $SEGS ]; then @@ -48,7 +48,7 @@ awk '{meeting=$1; channel="MDM"; speaker=$3; stime=$4; etime=$5; # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ +awk '{ segment=$1; split(segment,S,"[_]"); audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; @@ -71,12 +71,12 @@ awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $ #prep reco2file_and_channel cat $tmpdir/wav.scp | \ - perl -ane '$_ =~ m:^(\S+MDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+MDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_"; print "$1 $2 A\n"; ' > $tmpdir/reco2file_and_channel || exit 1; # we assume we adapt to the session only awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1\n";' \ > $tmpdir/utt2spk || exit 1; @@ -85,26 +85,27 @@ sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exi # but we want to properly score the overlapped segments, hence we generate the extra # utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1$2\n";' > $tmpdir/utt2spk_stm || exit 1; #check and correct case when segment timings for a given speaker overlap themself #(important for simulatenous asclite scoring to proceed). 
#There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ - perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; - if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" - } - $pu=$_[1]; $pt=$_[4]; - }' > $tmpdir/segments_to_fix + awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; + if(spk_prev == spk && t_end_prev > t_beg) { + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + } + spk_prev=spk; t_end_prev=t_end; + }' > $tmpdir/segments_to_fix + if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix while read line; do p1=`echo $line | awk -F'>' '{print $1}'` p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $tmpdir/segments + sed -ir "s:$p1:$p2:" $tmpdir/segments done < $tmpdir/segments_to_fix fi diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 90690731ec9..01173d2e3a6 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski) -# AMI Corpus dev/eval data preparation +# AMI Corpus dev/eval data preparation . path.sh #check existing directories if [ $# != 3 ]; then echo "Usage: ami_sdm_scoring_data_prep.sh " - exit 1; -fi + exit 1; +fi AMI_DIR=$1 MICNUM=$2 @@ -25,8 +25,8 @@ mkdir -p $tmpdir # Audio data directory check if [ ! -d $AMI_DIR ]; then echo "Error: run.sh requires a directory argument" - exit 1; -fi + exit 1; +fi # And transcripts check if [ ! -f $SEGS ]; then @@ -53,7 +53,7 @@ awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5; # (1c) Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #AMI_ES2011a_H00_FEE041_0003415_0003484 -awk '{ +awk '{ segment=$1; split(segment,S,"[_]"); audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6]; @@ -76,13 +76,13 @@ awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $ #prep reco2file_and_channel cat $tmpdir/wav.scp | \ - perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+SDM).*\/([IETB].*)\.wav.*$: || die "bad label $_"; print "$1 $2 A\n"; '\ > $tmpdir/reco2file_and_channel || exit 1; # we assume we adapt to the session only awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1\n";' \ > $tmpdir/utt2spk || exit 1; @@ -91,27 +91,28 @@ sort -k 2 $tmpdir/utt2spk | utils/utt2spk_to_spk2utt.pl > $tmpdir/spk2utt || exi # but we want to properly score the overlapped segments, hence we generate the extra # utt2spk_stm file containing speakers ids used to generate the stms for mdm/sdm case awk '{print $1}' $tmpdir/segments | \ - perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; + perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_"; print "$1$2$3 $1$2\n";' \ > $tmpdir/utt2spk_stm || exit 1; -#check and correct the case when segment timings for given speaker overlap themself +#check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). 
#There is actually only one such case for devset and automatic segmentetions join $tmpdir/utt2spk_stm $tmpdir/segments | \ - perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; - if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" - } - $pu=$_[1]; $pt=$_[4]; - }' > $tmpdir/segments_to_fix + awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; + if(spk_prev == spk && t_end_prev > t_beg) { + print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + } + spk_prev=spk; t_end_prev=t_end; + }' > $tmpdir/segments_to_fix + if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix while read line; do p1=`echo $line | awk -F'>' '{print $1}'` p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $tmpdir/segments + sed -ir "s:$p1:$p2:" $tmpdir/segments done < $tmpdir/segments_to_fix fi diff --git a/egs/ami/s5/local/ami_text_prep.sh b/egs/ami/s5/local/ami_text_prep.sh index 0b87d10e4de..777c3d8b086 100755 --- a/egs/ami/s5/local/ami_text_prep.sh +++ b/egs/ami/s5/local/ami_text_prep.sh @@ -9,29 +9,30 @@ if [ $# -ne 1 ]; then exit 1; fi -set -e -set -u +set -eux -amidir=$1 -mkdir -p $amidir +dir=$1 +mkdir -p $dir -echo "Downloading annotiations..." +echo "Downloading annotations..." amiurl=http://groups.inf.ed.ac.uk/ami annotver=ami_public_manual_1.6.1 -annot="$amidir/$annotver" +annot="$dir/$annotver" logdir=data/local/downloads; mkdir -p $logdir/log [ ! -f $annot.zip ] && wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $logdir/log/download_ami_annot.log -mkdir -p $amidir/annotations -unzip -o -d $amidir/annotations $annot.zip &> /dev/null +if [ ! -d $dir/annotations ]; then + mkdir -p $dir/annotations + unzip -o -d $dir/annotations $annot.zip &> /dev/null +fi -[ ! -f "$amidir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $amidir/annotations." && exit 1; +[ ! -f "$dir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $dir/annotations." && exit 1; # extract text from AMI XML annotations, -local/ami_xml2text.sh $amidir +local/ami_xml2text.sh $dir wdir=data/local/annotations [ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1; @@ -39,7 +40,7 @@ wdir=data/local/annotations echo "Preprocessing transcripts..." local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log -#make final train/dev/eval splits +# make final train/dev/eval splits for dset in train eval dev; do [ ! -f local/split_$dset.final ] && cp local/split_$dset.orig local/split_$dset.final grep -f local/split_$dset.final $wdir/transcripts2 > $wdir/$dset.txt diff --git a/egs/ami/s5/local/ami_xml2text.sh b/egs/ami/s5/local/ami_xml2text.sh index 4d5431c6a4d..c4b90a33702 100755 --- a/egs/ami/s5/local/ami_xml2text.sh +++ b/egs/ami/s5/local/ami_xml2text.sh @@ -19,7 +19,8 @@ JAVA_VER=$(java -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; if [ "$JAVA_VER" -ge 15 ]; then if [ ! -d $wdir/nxt ]; then echo "Downloading NXT annotation tool..." - wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip &> /dev/null + wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip + [ ! -s $wdir/nxt.zip ] && echo "Downloading failed! 
($wdir/nxt.zip)" && exit 1 unzip -d $wdir/nxt $wdir/nxt.zip &> /dev/null fi diff --git a/egs/ami/s5/local/chain/run_blstm_ami_5.sh b/egs/ami/s5/local/chain/run_blstm_ami_5.sh new file mode 100755 index 00000000000..d9437af7e0c --- /dev/null +++ b/egs/ami/s5/local/chain/run_blstm_ami_5.sh @@ -0,0 +1,178 @@ +#!/bin/bash + + +### +# Does not give improvements over xent+blstm system !! +#local/chain/run_blstm_ami_5.sh --mic sdm1 --use-ihm-ali false --max-wer 45 --affix msl1.5_45wer +# %WER 42.5 | 14769 94491 | 61.0 19.9 19.1 3.5 42.5 67.5 | 0.605 | exp/sdm1/chain/blstm_ami5_msl1.5_45wer_sp/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys +# %WER 45.7 | 13674 89971 | 57.7 21.0 21.3 3.5 45.7 69.1 | 0.572 | exp/sdm1/chain/blstm_ami5_msl1.5_45wer_sp/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +decode_stage=1 +mic=ihm +use_ihm_ali=false +affix= +common_egs_dir= +exp_name=blstm_ami5 + +# LSTM options +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 + + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# training options +# chain options +xent_regularize=0.1 +max_wer=45 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 data/$mic/$latgen_train_set $lang $ali_dir $treedir + +fi + +# combining the segments in training data to have a minimum length of frames_per_eg + tolerance +# this is critical stage in AMI (gives 1% absolute improvement) +if [ -z $min_seg_len ]; then + min_seg_len=$(python -c "print ($frames_per_eg+5)/100.0") +fi + +if [ $stage -le 12 ]; then + rm -rf data/$mic/${train_set}_min${min_seg_len}_hires + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/$mic/${train_set}_hires \ + --output-data-dir data/$mic/${train_set}_min${min_seg_len}_hires + + #extract ivectors for the new data + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/$mic/${train_set}_min${min_seg_len}_hires data/$mic/${train_set}_min${min_seg_len}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/$mic/${train_set}_min${min_seg_len}_hires_max2 \ + exp/$mic/nnet3/extractor \ + exp/$mic/nnet3/ivectors_${train_set}_min${min_seg_len} || exit 1; + + # combine the non-hires features for alignments/lattices + rm -rf data/$mic/${latgen_train_set}_min${min_seg_len} + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/$mic/${latgen_train_set} \ + --output-data-dir data/$mic/${latgen_train_set}_min${min_seg_len} +fi + +train_set=${train_set}_min${min_seg_len} +latgen_train_set=${latgen_train_set}_min${min_seg_len} +ivector_dir=exp/$mic/nnet3/ivectors_${train_set} +ali_dir=${ali_dir}_min${min_seg_len} +lat_dir=${lat_dir}_min${min_seg_len} +if [ $stage -le 13 ]; then + # realigning data as the segments would have changed + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" data/$mic/$latgen_train_set data/lang $gmm_dir $ali_dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" data/$mic/$latgen_train_set \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +mkdir -p $dir +train_data_dir=data/$mic/${train_set}_hires +if [ ! -z $max_wer ]; then + if [ $stage -le 15 ]; then + bad_utts_dir=${gmm_dir}_${mic}_${train_set}_bad_utts # added mic in name as this can be ihm directory where parallel mdm and sdm utts are written + if [ ! -f $bad_utts_dir/all_info.sorted.txt ]; then + # This stage takes a lot of time ~7hrs, so run only if file is not available already + steps/cleanup/find_bad_utts.sh --cmd "$decode_cmd" --nj 405 data/$mic/$latgen_train_set data/lang $ali_dir $bad_utts_dir + fi + python local/sort_bad_utts.py --bad-utt-info-file $bad_utts_dir/all_info.sorted.txt --max-wer $max_wer --output-file $dir/wer_sorted_utts_${max_wer}wer + utils/copy_data_dir.sh --validate-opts "--no-wav" data/$mic/${train_set}_hires data/$mic/${train_set}_${max_wer}wer_hires + utils/filter_scp.pl $dir/wer_sorted_utts_${max_wer}wer data/$mic/${train_set}_hires/feats.scp > data/$mic/${train_set}_${max_wer}wer_hires/feats.scp + utils/fix_data_dir.sh data/$mic/${train_set}_${max_wer}wer_hires + fi + train_data_dir=data/$mic/${train_set}_${max_wer}wer_hires + # we don't realign again as the segment ids don't change +fi + +cat > $dir/vars <|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E ' (ACH|AH|EEE|EH|ER|EW|HA|HEE|HM|HMM|HUH|MM|OOF|UH|UM) ' | \ + grep -i -v -E '' >${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/ami/s5/local/nnet/run_dnn.sh b/egs/ami/s5/local/nnet/run_dnn.sh index 9e4264cb7f0..c7b9db11acc 100755 --- a/egs/ami/s5/local/nnet/run_dnn.sh +++ b/egs/ami/s5/local/nnet/run_dnn.sh @@ -14,13 +14,13 @@ stage=0 # resume training with --stage=N # if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s [opts] \n\n" `basename $0` exit 1; fi mic=$1 gmmdir=exp/$mic/tri4a -data_fmllr=data-fmllr-tri4 +data_fmllr=data_${mic}-fmllr-tri4 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 @@ -28,10 +28,7 @@ graph_dir=$gmmdir/graph_${LM} # Set bash to 'debug' mode, it will exit on : # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +set -euxo pipefail # Store fMLLR features, so we can train on them easily, if [ $stage -le 0 ]; then @@ -102,13 +99,13 @@ if [ $stage -le 4 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ $data_fmllr/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 4 3 2 1; do + for ITER in 4 1; do steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/dev $dir/decode_dev_${LM}_it${ITER} steps/nnet/decode.sh --nj $nj_decode --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir $data_fmllr/$mic/eval $dir/decode_eval_${LM}_it${ITER} done fi diff --git a/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh b/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh index 04cc7fe7052..4caf140093d 100755 --- a/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh +++ b/egs/ami/s5/local/nnet/run_dnn_lda_mllt.sh @@ -14,7 +14,7 @@ stage=0 # resume training with --stage=N # if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s [opts] \n\n" `basename $0` exit 1; fi mic=$1 @@ -50,6 +50,7 @@ if [ $stage -le 1 ]; then # - re-use CMVN options, feat_dim=$(feat-to-dim scp:data/$mic/train/feats.scp -) cmvn_opts=$(cat $gmmdir/cmvn_opts) + [ -z $cmvn_opts ] && cmvn_opts="--norm-means=true --norm-vars=false" # GMM default, { echo " $feat_dim $((feat_dim*7)) [ -3 -2 -1 0 1 2 3 ]" echo " $((feat_dim*7)) 40 $gmmdir/final.mat" @@ -105,13 +106,13 @@ if [ $stage -le 4 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ data/$mic/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 4 3 2 1; do + for ITER in 4 1; do steps/nnet/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config 
conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir data/$mic/dev $dir/decode_dev_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir data/$mic/dev $dir/decode_dev_${LM}_it${ITER} steps/nnet/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode_dnn.conf \ - --num-threads 3 --nnet $dir/${ITER}.nnet --acwt $acwt \ - $graph_dir data/$mic/eval $dir/decode_eval_${LM} + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $graph_dir data/$mic/eval $dir/decode_eval_${LM}_it${ITER} done fi diff --git a/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh b/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh new file mode 100755 index 00000000000..df069929377 --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_parallel_datadirs.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# this script creates a new data directory data/$new_mic +# where the train, dev and eval directories are copied from $original_mic +# in addition to these a new data directory train_parallel is created which has +# the segment ids from data/$original_mic but the wav data is copied from +# data/$parallel_mic + +original_mic=sdm1 +parallel_mic=ihm +new_mic=sdm1_cleanali + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +#copy the clean data directory and change the segment ids +for dset in train dev eval; do + utils/copy_data_dir.sh data/$original_mic/$dset data/$new_mic/$dset +done +dset=train +utils/copy_data_dir.sh data/$parallel_mic/$dset data/$new_mic/${dset}_parallel +rm -rf data/$new_mic/${dset}_parallel/{text,feats.scp,cmvn.scp} +cp data/$new_mic/$dset/{spk2utt,text,utt2spk} data/$new_mic/${dset}_parallel +cp data/$new_mic/${dset}_parallel/wav.scp data/$new_mic/${dset}_parallel/wav.scp_full +cp data/$new_mic/${dset}_parallel/reco2file_and_channel data/$new_mic/${dset}_parallel/reco2file_and_channel_full + +dset=train +# map sdm/mdm segments to the ihm segments +tmpdir=`mktemp -d ./tmpXXX` +cat data/$parallel_mic/$dset/segments | sed -e "s/_H[0-9][0-9]_//g" > $tmpdir/key2ihm +cat data/$new_mic/$dset/segments | awk '{print $1}' > $tmpdir/dm_utts +mic_basename=$(echo $original_mic | sed -e "s/[0-9]//g") +if [ $mic_basename == "sdm" ]; then + pattern="_SDM_" +else + pattern="_MDM_" +fi +cat $tmpdir/dm_utts | sed -e "s/$pattern//g" > $tmpdir/key +paste -d' ' $tmpdir/key $tmpdir/dm_utts > $tmpdir/key2dm + +python -c " +ihm = dict(map(lambda x: [x.split()[0], ' '.join(x.split()[1:])], open('$tmpdir/key2ihm').readlines())) +dm = dict(map(lambda x: x.split(), open('$tmpdir/key2dm').readlines())) + +keys = ihm.keys() +keys.sort() + +for key in keys : + try: + print '{0} {1}'.format(dm[key], ihm[key]) + except KeyError: + continue +" > data/$new_mic/${dset}_parallel/segments + +cat data/$new_mic/${dset}_parallel/segments | awk '{print $2}' |sort -u > $tmpdir/ids +utils/filter_scp.pl $tmpdir/ids \ + data/$new_mic/${dset}_parallel/wav.scp_full > \ + data/$new_mic/${dset}_parallel/wav.scp + +utils/filter_scp.pl $tmpdir/ids \ + data/$new_mic/${dset}_parallel/reco2file_and_channel_full > \ + data/$new_mic/${dset}_parallel/reco2file_and_channel +utils/fix_data_dir.sh data/$new_mic/${dset}_parallel + +exit 0; diff --git a/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh b/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh new file mode 100755 index 00000000000..4041ecde27e --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_parallel_perturbed_alignments.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +# This script creates the parallel data dir based on ihm data, +# creates speed 
perturbed versions of this parallel data +# and generates the corresponding alignments. +# The parallel data dir has segment ids from distant microphone data +# but the wav data is copied from ihm. + +mic=sdm1 +new_mic=sdm1_cleanali +use_sat_alignments=true +nj=10 +stage=0 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +speed_perturb_datadir() { + mic=$1 + dataset=$2 + extract_features=$3 + + utils/perturb_data_dir_speed.sh 0.9 data/$mic/$dataset data/$mic/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/$mic/$dataset data/$mic/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/$mic/$dataset data/$mic/temp3 + utils/combine_data.sh --extra-files utt2uniq data/$mic/${dataset}_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + + if [ "$extract_features" == "true" ]; then + mfccdir=mfcc_${mic}_perturbed + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + for x in ${dataset}_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + done + fi + utils/fix_data_dir.sh data/$mic/${dataset}_sp +} + +if [ $stage -le 0 ]; then + # we will use ihm alignments as targets + # but as the segment names differ we will create a new data dir + local/nnet3/prepare_parallel_datadirs.sh --original-mic $mic \ + --parallel-mic ihm \ + --new-mic $new_mic +fi + +mic=$new_mic +if [ $stage -le 1 ]; then +# extract the features for the parallel data dir which will be used for alignments +# in case there is no speed perturbation + mfccdir=mfcc_${mic} + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/${mic}/train_parallel exp/make_${mic}_mfcc/train_parallel $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/train_parallel exp/make_${mic}_mfcc/train_parallel $mfccdir || exit 1; + utils/fix_data_dir.sh data/$mic/train_parallel +fi + +if [ $stage -le 2 ]; then + # if we are using the ihm alignments we just need features for the parallel + # data, the actual data is being perturbed just so that we can copy this + # directory to create hiresolution features later + speed_perturb_datadir $mic train_parallel true + speed_perturb_datadir $mic train false +fi + +if [ $stage -le 3 ]; then + # we just need to recreate alignments in case we perturbed the data + # or in the case we are using ihm alignments, else the alignments would already + # have been generated when we built the GMM-HMM systems + data_set=train_parallel_sp + if [ "$use_sat_alignments" == "true" ]; then + gmm_dir=exp/ihm/tri4a + align_script=steps/align_fmllr.sh + else + gmm_dir=exp/ihm/tri3a + align_script=steps/align_si.sh + fi + $align_script --nj $nj --cmd "$train_cmd" \ + data/$mic/train_parallel_sp data/lang $gmm_dir ${gmm_dir}_${mic}_${data_set}_ali || exit 1; +fi + +exit 0; diff --git a/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh b/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh new file mode 100755 index 00000000000..4c9e26aa13f --- /dev/null +++ b/egs/ami/s5/local/nnet3/prepare_perturbed_alignments.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# This script creates speed perturbed versions of the training data +# and generates the corresponding alignments + +mic=ihm +nj=10 +stage=0 +use_sat_alignments=true + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +speed_perturb_datadir() { + mic=$1 + dataset=$2 + extract_features=$3 + + utils/perturb_data_dir_speed.sh 0.9 data/$mic/$dataset data/$mic/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/$mic/$dataset data/$mic/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/$mic/$dataset data/$mic/temp3 + utils/combine_data.sh --extra-files utt2uniq data/$mic/${dataset}_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 + + if [ "$extract_features" == "true" ]; then + mfccdir=mfcc_${mic}_perturbed + for x in ${dataset}_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ + data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; + done + fi + utils/fix_data_dir.sh data/$mic/${dataset}_sp +} + + +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignment + # _sp stands for speed-perturbed + speed_perturb_datadir $mic train true +fi + + +if [ $stage -le 2 ]; then + # we just need to recreate alignments in case we perturbed the data + # or in the case we are using ihm alignments, else the alignments would already + # have been generated when we built the GMM-HMM systems + data_set=train_sp + if [ "$use_sat_alignments" == "true" ]; then + gmm_dir=exp/$mic/tri4a + align_script=steps/align_fmllr.sh + else + gmm_dir=exp/$mic/tri3a + align_script=steps/align_si.sh + fi + $align_script --nj $nj --cmd "$train_cmd" \ + data/$mic/train_sp data/lang $gmm_dir ${gmm_dir}_${mic}_${data_set}_ali || exit 1; +fi + +exit 0; diff --git a/egs/ami/s5/local/nnet3/run_blstm.sh b/egs/ami/s5/local/nnet3/run_blstm.sh new file mode 100755 index 00000000000..d5dee155ba2 --- /dev/null +++ b/egs/ami/s5/local/nnet3/run_blstm.sh @@ -0,0 +1,46 @@ +stage=0 +train_stage=-10 +mic=ihm +affix=bidirectional +common_egs_dir= +remove_egs=true +use_ihm_ali=false +use_sat_alignments=true + +# BLSTM params +cell_dim=512 +rp_dim=128 +nrp_dim=128 +chunk_left_context=40 +chunk_right_context=40 + +# training options +num_jobs_initial=2 +num_jobs_final=12 +samples_per_iter=20000 +realign_times= +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +local/nnet3/run_lstm.sh --affix $affix \ + --stage $stage \ + --train-stage $train_stage \ + --lstm-delay " [-1,1] [-2,2] [-3,3] " \ + --label-delay 0 \ + --cell-dim $cell_dim \ + --recurrent-projection-dim $rp_dim \ + --non-recurrent-projection-dim $nrp_dim \ + --common-egs-dir "$common_egs_dir" \ + --chunk-left-context $chunk_left_context \ + --chunk-right-context $chunk_right_context \ + --mic $mic \ + --num-jobs-initial $num_jobs_initial \ + --num-jobs-final $num_jobs_final \ + --samples-per-iter $samples_per_iter \ + --use-ihm-ali $use_ihm_ali \ + --use-sat-alignments $use_sat_alignments \ + --realign-times "$realign_times" \ + --remove-egs $remove_egs + diff --git a/egs/ami/s5/local/nnet3/run_ivector_common.sh b/egs/ami/s5/local/nnet3/run_ivector_common.sh index 227c2fbe209..1b5e64c04fb 100755 --- a/egs/ami/s5/local/nnet3/run_ivector_common.sh +++ b/egs/ami/s5/local/nnet3/run_ivector_common.sh @@ -1,30 +1,63 @@ #!/bin/bash # this script contains some common (shared) parts of the run_nnet*.sh scripts. - -. 
cmd.sh - +# speed perturbation is done for the training data stage=0 mic=ihm num_threads_ubm=32 -speed_perturb=true +nj=10 +use_ihm_ali=false use_sat_alignments=true -set -e . cmd.sh . ./path.sh . ./utils/parse_options.sh -if [ "$use_sat_alignments" == "true" ] ; then - gmm_dir=exp/$mic/tri4a - align_script=steps/align_fmllr.sh +volume_perturb_datadir() { + dir=$1 + cat $dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; + mv $dir/wav.scp $dir/wav.scp_nonorm + mv $dir/wav.scp_scaled $dir/wav.scp +} + +if [ "$use_sat_alignments" == "true" ]; then + gmm=tri4a +else + gmm=tri3a +fi + +if [ "$use_ihm_ali" == "true" ]; then + if [ "$mic" == "ihm" ]; then + echo "This is an IHM setup, using the use_ihm_ali=true options does not make sense. Rerun with use_ihm_ali=false" && exit 1; + fi + # prepare the parallel data directory ${mic}_clean_ali + # generate alignments from the perturbed parallel data + local/nnet3/prepare_parallel_perturbed_alignments.sh --stage $stage \ + --mic $mic \ + --new-mic ${mic}_cleanali \ + --use-sat-alignments $use_sat_alignments + # we are going to modify the mic name as changing the alignments + # changes the ivector extractor + mic=${mic}_cleanali + ali_dir=exp/ihm/${gmm}_${mic}_train_parallel_sp_ali else - gmm_dir=exp/$mic/tri3a - align_script=steps/align_si.sh + # prepare the perturbed data directory and generate alignments + local/nnet3/prepare_perturbed_alignments.sh --stage $stage --mic $mic \ + --use-sat-alignments $use_sat_alignments + + ali_dir=exp/$mic/${gmm}_${mic}_train_sp_ali fi -if [ $stage -le 1 ]; then +if [ $stage -le 4 ]; then # Create high-resolution MFCC features (with 40 cepstra instead of 13). # this shows how you can split across multiple file-systems. we'll split the # MFCC dir across multiple locations. You might want to be careful here, if you @@ -35,96 +68,50 @@ if [ $stage -le 1 ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in train dev eval; do + for datadir in train_sp dev eval; do utils/copy_data_dir.sh data/$mic/$datadir data/$mic/${datadir}_hires - if [ "$datadir" == "train" ]; then - dir=data/$mic/train_hires - cat $dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; - mv $dir/wav.scp $dir/wav.scp_nonorm - mv $dir/wav.scp_scaled $dir/wav.scp + if [ "$datadir" == "train_sp" ]; then + volume_perturb_datadir data/$mic/${datadir}_hires fi - steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/$mic/${datadir}_hires exp/make_${mic}_hires/$datadir $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires exp/make_${mic}_hires/$mic/$datadir $mfccdir || exit 1; - done + utils/fix_data_dir.sh data/$mic/${datadir}_hires + done fi -if [ $stage -le 2 ]; then +if [ $stage -le 5 ]; then # Train a system just for its LDA+MLLT transform. 
We use --num-iters 13 # because after we get the transform (12th iter is the last), any further # training is pointless. steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ --realign-iters "" \ --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/$mic/train_hires data/lang \ - ${gmm_dir}_ali exp/$mic/nnet3/tri5 + 5000 10000 data/$mic/train_sp_hires data/lang \ + $ali_dir exp/$mic/nnet3/tri5 fi -if [ $stage -le 3 ]; then +if [ $stage -le 6 ]; then steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ --num-frames 700000 \ --num-threads $num_threads_ubm \ - data/$mic/train_hires 512 exp/$mic/nnet3/tri5 exp/$mic/nnet3/diag_ubm + data/$mic/train_sp_hires 512 exp/$mic/nnet3/tri5 exp/$mic/nnet3/diag_ubm fi -if [ $stage -le 4 ]; then +if [ $stage -le 7 ]; then # iVector extractors can in general be sensitive to the amount of data, but # this one has a fairly small dim (defaults to 100) steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/$mic/train_hires exp/$mic/nnet3/diag_ubm exp/$mic/nnet3/extractor || exit 1; -fi - -if [ $stage -le 5 ] && [ "$speed_perturb" == "true" ]; then - #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment - # _sp stands for speed-perturbed - utils/perturb_data_dir_speed.sh 0.9 data/$mic/train data/$mic/temp1 - utils/perturb_data_dir_speed.sh 1.0 data/$mic/train data/$mic/temp2 - utils/perturb_data_dir_speed.sh 1.1 data/$mic/train data/$mic/temp3 - utils/combine_data.sh --extra-files utt2uniq data/$mic/train_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 - rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3 - - mfccdir=mfcc_${mic}_perturbed - for x in train_sp; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \ - data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1; - done - utils/fix_data_dir.sh data/$mic/train_sp - - $align_script --nj $nj --cmd "$train_cmd" \ - data/$mic/train_sp data/lang $gmm_dir ${gmm_dir}_sp_ali || exit 1 - - #Now perturb the high resolution daa - utils/copy_data_dir.sh data/$mic/train_sp data/$mic/train_sp_hires - mfccdir=mfcc_${mic}_perturbed_hires - for x in train_sp_hires; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --mfcc-config conf/mfcc_hires.conf \ - data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1; - done - utils/fix_data_dir.sh data/$mic/train_sp_hires + data/$mic/train_sp_hires exp/$mic/nnet3/diag_ubm exp/$mic/nnet3/extractor || exit 1; fi -if [ "$speed_perturb" == "true" ]; then - train_set=train_sp -else - train_set=train -fi -if [ $stage -le 6 ]; then - rm exp/$mic/nnet3/.error 2>/dev/null - ivectordir=exp/$mic/nnet3/ivectors_${train}_hires +if [ $stage -le 8 ]; then + rm -f exp/$mic/nnet3/.error 2>/dev/null + ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage fi @@ -135,13 +122,23 @@ if [ $stage -le 6 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). 
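A note on the `--utts-per-spk-max 2` step that follows: it splits every real speaker into pseudo-speakers holding at most two utterances each, so the iVector extractor is trained on many "speakers" with little data per speaker, which is what the comment above is getting at. A minimal sketch of the utt2spk rewrite, with hypothetical paths; the real `steps/online/nnet2/copy_data_dir.sh` also copies feats.scp, cmvn.scp and the other data-dir files:

```sh
# Sketch only: cap each speaker at 2 utterances by renaming speakers in utt2spk.
awk '{ n[$2]++; printf("%s %s-%d\n", $1, $2, int((n[$2]-1)/2)); }' \
  data/ihm/train_sp_hires/utt2spk > data/ihm/train_sp_hires_max2/utt2spk
```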
- steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/${train}_hires data/$mic/${train}_hires_max2 + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/train_sp_hires data/$mic/train_sp_hires_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/$mic/${train}_hires_max2 \ + data/$mic/train_sp_hires_max2 \ exp/$mic/nnet3/extractor \ - exp/$mic/nnet3/ivectors_${train}_hires \ + exp/$mic/nnet3/ivectors_train_sp_hires \ || touch exp/$mic/nnet3/.error [ -f exp/$mic/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; fi +if [ $stage -le 9 ]; then + rm -f exp/$mic/nnet3/.error 2>/dev/null + for data in dev eval; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/$mic/${data}_hires exp/$mic/nnet3/extractor exp/$mic/nnet3/ivectors_${data} || touch exp/$mic/nnet3/.error & + done + wait + [ -f exp/$mic/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + exit 0; diff --git a/egs/ami/s5/local/nnet3/run_lstm.sh b/egs/ami/s5/local/nnet3/run_lstm.sh index c98d8340278..d077d14cc1e 100755 --- a/egs/ami/s5/local/nnet3/run_lstm.sh +++ b/egs/ami/s5/local/nnet3/run_lstm.sh @@ -1,119 +1,191 @@ #!/bin/bash -# this is a basic lstm script +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script, it can also be used to train blstm models. +# the blstm can be run using local/nnet3/run_blstm.sh which invokes this script +# with the necessary parameters +# Note: lstm script runs for more epochs than the tdnn script # At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. -set -e +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false stage=0 train_stage=-10 -has_fisher=true mic=ihm -use_sat_alignments=true +use_ihm_ali=false +use_sat_alignments=false # if true, use tri4a alignments are used + # by default GMM-HMM systems are not built to this stage + # in SDM and MDM systems. So run the tri4a stage if you + # want to use this option affix= -speed_perturb=true -splice_indexes="-2,-1,0,1,2 0" common_egs_dir= -. cmd.sh +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 +shrink=0.99 +max_param_change=2.0 + +# training options +num_epochs=10 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=2 +num_jobs_final=12 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true +realign_times= + +# feature options +use_ivectors=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= +decode_iter= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh . ./path.sh . ./utils/parse_options.sh if ! cuda-compiled; then - cat < # minumum LM-weight for lattice rescoring " echo " --max_lmwt # maximum LM-weight for lattice rescoring " - echo " --reverse (true/false) # score with time reversed features " exit 1; fi @@ -30,9 +30,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
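A few lines below, this scoring script derives the CTM frame shift from the model directory. The string splice `--frame-shift=0.0$factor` turns a `frame_subsampling_factor` of, say, 3 into `--frame-shift=0.03`, i.e. 30 ms CTM frames; this relies on the base frame shift being 10 ms and the factor being a single digit, which holds for the common factor of 3. A quick illustration with assumed values:

```sh
# Illustration of the frame-shift derivation used further down in this script.
factor=3                                     # contents of $dir/../frame_subsampling_factor
frame_shift_opt="--frame-shift=0.0$factor"   # expands to --frame-shift=0.03
echo "$frame_shift_opt"
```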
dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -41,56 +41,93 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + name=`basename $data`; # e.g. eval2000 +nj=$(cat $dir/num_jobs) mkdir -p $dir/ascoring/log if [ $stage -le 0 ]; then - if $reverse; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/ascore_LMWT/ '&&' \ - lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-reverse ark:- ark:- \| \ - lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/ascore_LMWT/$name.ctm || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/ascore_LMWT/ '&&' \ - lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + for LMWT in $(seq $min_lmwt $max_lmwt); do + rm -f $dir/.error + ( + $cmd JOB=1:$nj $dir/ascoring/log/get_ctm.${LMWT}.JOB.log \ + mkdir -p $dir/ascore_${LMWT}/ '&&' \ + lattice-scale --inv-acoustic-scale=${LMWT} "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf $frame_shift_opt --decode-mbr=$decode_mbr ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/ascore_LMWT/$name.ctm || exit 1; - fi + '>' $dir/ascore_${LMWT}/${name}.JOB.ctm || touch $dir/.error; + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/ascore_${LMWT}/${name}.${n}.ctm; done > $dir/ascore_${LMWT}/${name}.ctm + rm -f $dir/ascore_${LMWT}/${name}.*.ctm + )& + done + wait; + [ -f $dir/.error ] && echo "$0: error during ctm generation. check $dir/ascoring/log/get_ctm.*.log" && exit 1; fi if [ $stage -le 1 ]; then # Remove some stuff we don't want to score, from the ctm. - for x in $dir/ascore_*/$name.ctm; do +# - we remove hesitations here, otherwise the CTM would have a bug! +# (confidences in place of the removed hesitations), + for x in $dir/ascore_*/${name}.ctm; do cp $x $dir/tmpf; cat $dir/tmpf | grep -i -v -E '\[noise|laughter|vocalized-noise\]' | \ + grep -i -v -E ' (ACH|AH|EEE|EH|ER|EW|HA|HEE|HM|HMM|HUH|MM|OOF|UH|UM) ' | \ grep -i -v -E '' > $x; # grep -i -v -E '|%HESITATION' > $x; done fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then if [ "$asclite" == "true" ]; then oname=$name [ ! 
-z $overlap_spk ] && oname=${name}_o$overlap_spk + echo "asclite is starting" + # Run scoring, meaning of hubscr.pl options: + # -G .. produce alignment graphs, + # -v .. verbose, + # -m .. max-memory in GBs, + # -o .. max N of overlapping speakers, + # -a .. use asclite, + # -C .. compression for asclite, + # -B .. blocksize for asclite (kBs?), + # -p .. path for other components, + # -V .. skip validation of input transcripts, + # -h rt-stt .. removes non-lexical items from CTM, $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/score.LMWT.log \ cp $data/stm $dir/ascore_LMWT/ '&&' \ cp $dir/ascore_LMWT/${name}.ctm $dir/ascore_LMWT/${oname}.ctm '&&' \ $hubscr -G -v -m 1:2 -o$overlap_spk -a -C -B 8192 -p $hubdir -V -l english \ - -h rt-stt -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${oname}.ctm || exit 1; + -h rt-stt -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${oname}.ctm || exit 1 + # Compress some scoring outputs : alignment info and graphs, + echo -n "compressing asclite outputs " + for LMWT in $(seq $min_lmwt $max_lmwt); do + ascore=$dir/ascore_${LMWT} + gzip -f $ascore/${oname}.ctm.filt.aligninfo.csv + cp $ascore/${oname}.ctm.filt.alignments/index.html $ascore/${oname}.ctm.filt.overlap.html + tar -C $ascore -czf $ascore/${oname}.ctm.filt.alignments.tar.gz ${oname}.ctm.filt.alignments + rm -r $ascore/${oname}.ctm.filt.alignments + echo -n "LMWT:$LMWT " + done + echo done else $cmd LMWT=$min_lmwt:$max_lmwt $dir/ascoring/log/score.LMWT.log \ cp $data/stm $dir/ascore_LMWT/ '&&' \ - $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${name}.ctm || exit 1 + $hubscr -p $hubdir -v -V -l english -h hub5 -g $data/glm -r $dir/ascore_LMWT/stm $dir/ascore_LMWT/${name}.ctm || exit 1 fi fi diff --git a/egs/ami/s5/local/sort_bad_utts.py b/egs/ami/s5/local/sort_bad_utts.py new file mode 100644 index 00000000000..f84fcb12608 --- /dev/null +++ b/egs/ami/s5/local/sort_bad_utts.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python + +import sys +import argparse +import logging + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--bad-utt-info-file", type=str, required=True) + parser.add_argument("--output-file", type=str, required=True) + parser.add_argument("--max-wer", type=float, default=20) + + print(' '.join(sys.argv)) + args = parser.parse_args() + + return args + +def GetSortedWers(utt_info_file): + utt_wer = [] + for line in open(utt_info_file, 'r'): + parts = line.split() + utt = parts[0] + wer = float(parts[1])/float(parts[2])*100 + utt_wer.append([utt, wer]) + + utt_wer_sorted = sorted(utt_wer, key = lambda k : k[1]) + try: + import numpy as np + bins = range(0,105,5) + bins.append(sys.float_info.max) + + hist, bin_edges = np.histogram(map(lambda x: x[1], utt_wer_sorted), + bins = bins) + num_utts = len(utt_wer) + string = '' + for i in range(len(hist)): + string += '[{0}, {1}] {2}\n'.format(bin_edges[i], bin_edges[i+1], float(hist[i])/num_utts * 100) + logger.info("The histogram is \n {0}".format(string)) + except ImportError: + pass + + return 
utt_wer_sorted + +def Main(): + args = GetArgs() + utt_wer_sorted = GetSortedWers(args.bad_utt_info_file) + out_file = open(args.output_file, 'w') + logger.info("Writing output to file : {0}.".format(args.output_file)) + + for row in utt_wer_sorted: + if row[1] <= args.max_wer: + out_file.write('{0} {1}\n'.format(row[0], row[1])) + out_file.close() +if __name__ == "__main__": + Main() diff --git a/egs/ami/s5/path.sh b/egs/ami/s5/path.sh index 52e44195f51..ad2c93b309b 100644 --- a/egs/ami/s5/path.sh +++ b/egs/ami/s5/path.sh @@ -1,11 +1,13 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/nnet3bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C LMBIN=$KALDI_ROOT/tools/irstlm/bin SRILM=$KALDI_ROOT/tools/srilm/bin/i686-m64 -BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt-3.51 +BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt export PATH=$PATH:$LMBIN:$BEAMFORMIT:$SRILM diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index 4590ba1deb8..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,21 +10,24 @@ mic=ihm stage=0 . utils/parse_options.sh -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac + +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 # Download AMI corpus, You need arount 130GB of free space to get whole data ihm+mdm, +# Avoiding re-download, using 'wget --continue ...', if [ $stage -le 0 ]; then [ -e data/local/downloads/wget_$mic.sh ] && \ echo "$data/local/downloads/wget_$mic.sh already exists, better quit than re-download... 
(use --stage N)" && \ @@ -54,9 +57,8 @@ fi if [ $stage -le 3 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -84,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -102,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -158,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -179,11 +181,11 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi -echo "Done!" +echo "Done." diff --git a/egs/ami/s5/run_mdm.sh b/egs/ami/s5/run_mdm.sh index 5d1d964e2b1..3c147e0aa99 100755 --- a/egs/ami/s5/run_mdm.sh +++ b/egs/ami/s5/run_mdm.sh @@ -7,28 +7,31 @@ nmics=8 #we use all 8 channels, possible other options are 2 and 4 mic=mdm$nmics -stage=0 -. 
utils/parse_options.sh - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x - # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac # MDM_DIR is directory for beamformed waves, -MDM_DIR=$AMI_DIR/beamformed # [Default] -#MDM_DIR=/disk/data1/s1136550/ami/mdm # [Edinburgh] +MDM_DIR=$AMI_DIR/beamformed # Default, +#MDM_DIR=/disk/data1/s1136550/ami/mdm # Edinburgh, +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 +stage=0 +. utils/parse_options.sh + +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + # Download AMI corpus (distant channels), You need around 130GB of free space to get whole data ihm+mdm, +# Avoiding re-download, using 'wget --continue ...', if [ $stage -le 0 ]; then [ -e data/local/downloads/wget_mdm.sh ] && \ echo "data/local/downloads/wget_mdm.sh already exists, better quit than re-download... (use --stage N)" && \ @@ -64,9 +67,8 @@ fi if [ $stage -le 4 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -179,5 +181,4 @@ if [ $stage -le 13 ]; then --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi - -echo "Done!" +echo "Done." diff --git a/egs/ami/s5/run_prepare_shared.sh b/egs/ami/s5/run_prepare_shared.sh index b931e910bb9..903de4125b8 100755 --- a/egs/ami/s5/run_prepare_shared.sh +++ b/egs/ami/s5/run_prepare_shared.sh @@ -3,34 +3,17 @@ . ./cmd.sh . ./path.sh -# To run this script you need SRILM, - # Path to Fisher transcripts LM interpolation (if not defined only AMI transcript LM is built), -FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 # Edinburgh, [DEFAULT] -# Path where AMI gets downloaded (or where locally available), -AMI_DIR=$PWD/DOWNLOAD/amicorpus # [DEFAULT] - -# We can make setup specific to the 'domain' where the cluster is, -case "$(hostname -d)" in - fit.vutbr.cz) # BUT cluster, - FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran - AMI_DIR=$(mktemp -d $(find /mnt/scratch*/$USER -maxdepth 0)/kaldi_ami_data_XXXXXX) - ;; - *) echo "Using defaults locations," - ;; +case $(hostname -d) in + fit.vutbr.cz) FISHER_TRANS=/mnt/matylda2/data/FISHER/fe_03_p1_tran ;; # BUT, + clsp.jhu.edu) FISHER_TRANS=/export/corpora4/ami/fisher_trans/part1 ;; # JHU, + cstr.ed.ac.uk) FISHER_TRANS=`pwd`/eddie_data/lm/data/fisher/part1 ;; # Edinburgh, esac +# Or select manually, +# FISHER_TRANS=... -# We can override the automatic setup by : -# './run_prepare_shared.sh --AMI-DIR [dir] --FISHER-TRANS [dir]' . 
utils/parse_options.sh -# Load previous / store the new AMI_DIR location, -[ -r conf/ami_dir ] && AMI_DIR=$(cat conf/ami_dir) || echo $AMI_DIR >conf/ami_dir - -if [ -z $IRSTLM ] ; then - export IRSTLM=$KALDI_ROOT/tools/irstlm/ -fi -export PATH=${PATH}:$IRSTLM/bin if ! command -v prune-lm >/dev/null 2>&1 ; then echo "$0: Error: the IRSTLM is not available or compiled" >&2 echo "$0: Error: We used to install it by default, but." >&2 @@ -40,13 +23,19 @@ if ! command -v prune-lm >/dev/null 2>&1 ; then exit 1 fi -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -x +if ! command -v ngram-count >/dev/null 2>&1 ; then + echo "$0: Error: the SRILM is not available or compiled" >&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_srilm.sh" >&2 + exit 1 +fi + +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail -local/ami_text_prep.sh $AMI_DIR +# Download of annotations, pre-processing, +local/ami_text_prep.sh data/local/downloads local/ami_prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang @@ -58,6 +47,6 @@ LM=$final_lm.pr1-7 prune-lm --threshold=1e-7 data/local/lm/$final_lm.gz /dev/stdout | gzip -c > data/local/lm/$LM.gz utils/format_lm.sh data/lang data/local/lm/$LM.gz data/local/dict/lexicon.txt data/lang_$LM -echo "Done!" +echo "Done" exit 0 diff --git a/egs/ami/s5/run_sdm.sh b/egs/ami/s5/run_sdm.sh old mode 100644 new mode 100755 index 3ae7e2c67df..99dd80941e4 --- a/egs/ami/s5/run_sdm.sh +++ b/egs/ami/s5/run_sdm.sh @@ -3,31 +3,34 @@ . ./cmd.sh . ./path.sh -# SDM - Signle Distant Microphone +# SDM - Signle Distant Microphone micid=1 #which mic from array should be used? mic=sdm$micid -stage=0 +stage=1 . utils/parse_options.sh -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in + fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, + clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, + cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, +esac + +[ ! -r data/local/lm/final_lm ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 # Download AMI corpus (distant channels), You need arount 130GB of free space to get whole data ihm+mdm, -if [ $stage -le 0 ]; then +# Avoiding re-download, using 'wget --continue ...', +if [ $stage -le 1 ]; then [ -e data/local/downloads/wget_sdm.sh ] && \ - echo "$data/local/downloads/wget_sdm.sh already exists, better quit than re-download... (use --stage N)" && \ + echo "data/local/downloads/wget_sdm.sh already exists, better quit than re-download... 
(use --stage N)" && \ exit 1 local/ami_download.sh --mics $micid sdm $AMI_DIR fi @@ -53,9 +56,8 @@ fi if [ $stage -le 4 ]; then # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k + # Full set 77h, reduced set 10.8h, + utils/subset_data_dir.sh data/$mic/train 15000 data/$mic/train_15k fi # Train systems, @@ -161,16 +163,36 @@ if [ $stage -le 13 ]; then --hidden-dim 850 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments false - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri3a \ --srcdir exp/$mic/nnet2_online/nnet_ms_sp fi + +#TDNN training (nnet3) +if [ $stage -le 14 ]; then + local/nnet3/run_tdnn.sh \ + --mic $mic \ + --speed-perturb true \ + --stage 9 \ + --use-sat-alignments false +fi +exit 1; + +#LSTM training (nnet3) +if [ $stage -le 15 ]; then + local/nnet3/run_lstm.sh \ + --mic $mic \ + --train-stage -5 \ + --speed-perturb true \ + --use-sat-alignments false +fi + echo "Done." -# By default we do not build systems adapted to sessions for AMI in distant scnearios +# By default we do not build systems adapted to sessions for AMI in distant scnearios # as this does not help a lot (around 1%), but one can do this by running below code: exit; @@ -186,7 +208,7 @@ graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} diff --git a/egs/ami/s5/run_sdm_lstm.sh b/egs/ami/s5/run_sdm_lstm.sh deleted file mode 100755 index ca4978f554c..00000000000 --- a/egs/ami/s5/run_sdm_lstm.sh +++ /dev/null @@ -1,201 +0,0 @@ -#!/bin/bash -u - -. ./cmd.sh -. ./path.sh - -# SDM - Signle Distant Microphone -micid=1 #which mic from array should be used? -mic=sdm$micid - -stage=0 -. utils/parse_options.sh - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail -set -x - -# Path where AMI gets downloaded (or where locally available): -[ ! -r conf/ami_dir ] && echo "Please, run 'run_prepare_shared.sh' first!" && exit 1 -AMI_DIR=$(cat conf/ami_dir) - -final_lm=`cat data/local/lm/final_lm` -LM=$final_lm.pr1-7 - -# Download AMI corpus (distant channels), You need arount 130GB of free space to get whole data ihm+mdm, -if [ $stage -le 0 ]; then - [ -e data/local/downloads/wget_sdm.sh ] && \ - echo "$data/local/downloads/wget_sdm.sh already exists, better quit than re-download... 
(use --stage N)" && \ - exit 1 - local/ami_download.sh --mics $micid sdm $AMI_DIR -fi - -# Prepare mdm data directories, -if [ $stage -le 2 ]; then - local/ami_sdm_data_prep.sh $AMI_DIR $micid - local/ami_sdm_scoring_data_prep.sh $AMI_DIR $micid dev - local/ami_sdm_scoring_data_prep.sh $AMI_DIR $micid eval -fi -# Here starts the normal recipe, which is mostly shared across mic scenarios, -# - for ihm we adapt to speaker by fMLLR, -# - for sdm and mdm we do not adapt for speaker, but for environment only (cmn), - -# Feature extraction, -if [ $stage -le 3 ]; then - for dset in train dev eval; do - steps/make_mfcc.sh --nj 15 --cmd "$train_cmd" data/$mic/$dset data/$mic/$dset/log data/$mic/$dset/data - steps/compute_cmvn_stats.sh data/$mic/$dset data/$mic/$dset/log data/$mic/$dset/data - done - for dset in train eval dev; do utils/fix_data_dir.sh data/$mic/$dset; done -fi - -if [ $stage -le 4 ]; then - # Taking a subset, now unused, can be handy for quick experiments, - # Full set 77h, reduced set 9.5h, - local/remove_dup_utts.sh 20 data/$mic/train data/$mic/train_nodup # remvove uh-huh, - utils/subset_data_dir.sh --shortest data/$mic/train_nodup 30000 data/$mic/train_30k -fi - -# Train systems, -nj=30 # number of parallel jobs, -nj_dev=$(cat data/$mic/dev/spk2utt | wc -l) -nj_eval=$(cat data/$mic/eval/spk2utt | wc -l) - -if [ $stage -le 5 ]; then - # Mono, - steps/train_mono.sh --nj $nj --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - data/$mic/train data/lang exp/$mic/mono - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/mono exp/$mic/mono_ali - - # Deltas, - steps/train_deltas.sh --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - 5000 80000 data/$mic/train data/lang exp/$mic/mono_ali exp/$mic/tri1 - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri1 exp/$mic/tri1_ali -fi - -if [ $stage -le 6 ]; then - # Deltas again, (full train-set), - steps/train_deltas.sh --cmd "$train_cmd" --cmvn-opts "--norm-means=true --norm-vars=false" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri1_ali exp/$mic/tri2a - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali - # Decode, - graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri2a/decode_eval_${LM} -fi - -# THE TARGET LDA+MLLT+SAT+BMMI PART GOES HERE: - -if [ $stage -le 7 ]; then - # Train tri3a, which is LDA+MLLT, - steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri2_ali exp/$mic/tri3a - # Decode, - graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi - -# skip SAT, and build MMI models -nj_mmi=80 -if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj_mmi --cmd "$train_cmd" \ 
- data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali -fi - -# At this point you can already run the DNN script: -# local/nnet/run_dnn_lda_mllt.sh -# exit 0 - -if [ $stage -le 9 ]; then - steps/make_denlats.sh --nj $nj_mmi --cmd "$decode_cmd" --config conf/decode.conf \ - data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_denlats -fi - -# 4 iterations of MMI seems to work well overall. The number of iterations is -# used as an explicit argument even though train_mmi.sh will use 4 iterations by -# default. -if [ $stage -le 10 ]; then - num_mmi_iters=4 - steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 --num-iters $num_mmi_iters \ - data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri3a_denlats \ - exp/$mic/tri3a_mmi_b0.1 -fi -if [ $stage -le 11 ]; then - # Decode, - graph_dir=exp/$mic/tri3a/graph_${LM} - for i in 4 3 2 1; do - decode_dir=exp/$mic/tri3a_mmi_b0.1/decode_dev_${i}.mdl_${LM} - steps/decode.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - --iter $i $graph_dir data/$mic/dev $decode_dir - decode_dir=exp/$mic/tri3a_mmi_b0.1/decode_eval_${i}.mdl_${LM} - steps/decode.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - --iter $i $graph_dir data/$mic/eval $decode_dir - done -fi - -# DNN training. This script is based on egs/swbd/s5b/local/run_dnn.sh -# Some of them would be out of date. -if [ $stage -le 12 ]; then - local/nnet/run_dnn_lda_mllt.sh $mic -fi - -# TDNN training. -if [ $stage -le 13 ]; then - local/nnet3/run_tdnn.sh \ - --mic $mic \ - --hidden-dim 850 \ - --speed-perturb true \ - --stage 7 \ - --use-sat-alignments false -fi - -#LSTM training -if [ $stage -le 14 ]; then - local/nnet3/run_lstm.sh \ - --mic $mic \ - --train-stage -5 \ - --speed-perturb true \ - --stage 7 \ - --common-egs-dir exp/sdm1/nnet3/lstm_sp/egs \ - --use-sat-alignments false -fi - - -echo "Done." - - -# By default we do not build systems adapted to sessions for AMI in distant scnearios -# as this does not help a lot (around 1%), but one can do this by running below code: -exit; - -# Train tri4a, which is LDA+MLLT+SAT, -steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali-fmllr - -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali-fmllr exp/$mic/tri4a - -# Decode, -graph_dir=exp/$mic/tri4a/graph_${LM} -$highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir -steps/decode_fmllr.sh --nj $nj_dev --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} -steps/decode_fmllr.sh --nj $nj_eval --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} - diff --git a/egs/apiai_decode/s5/README.md b/egs/apiai_decode/s5/README.md new file mode 100644 index 00000000000..c6d9bd23b77 --- /dev/null +++ b/egs/apiai_decode/s5/README.md @@ -0,0 +1,53 @@ +# Api.ai model decoding example scripts +This directory contains scripts on how to use a pre-trained chain enlgish model and kaldi base code to recognize any number of wav files. + +IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format. + +## Model +English pretrained model were released by Api.ai under Creative Commons Attribution-ShareAlike 4.0 International Public License. 
+- Acustic data is mostly mobile recorded data +- Language model is based on Assistant.ai logs and good for understanding short commands, like "Wake me up at 7 am" +For more details, visit https://github.com/api-ai/api-ai-english-asr-model + +## Usage +Ensure kaldi is compiled and this scripts are inside kaldi/egs// directory then run +```sh +$ ./download-model.sh # to download pretrained chain model +$ ./recognize-wav.sh test1.wav test2.wav # to do recognition +``` +See console output for recognition results. + +### Using steps/nnet3/decode.sh +You can use kaldi steps/nnet3/decode.sh, which will decode data and calculate Word Error Rate (WER) for it. + +Run: +```sh +$ recognize-wav.sh test1.wav test2.wav +``` +It will make data dir, calculate mfcc features for it and do decoding, you need only first two steps out of it. If you want WER then edit data/test-corpus/text and replace NO_TRANSCRIPTION with expected text transcription for every wav file. + +Run for decoding: +```sh +$ steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --cmd run.pl --nj 1 exp/api.ai-model/ data/test-corpus/ exp/api.ai-model/decode/ +``` +See exp/api.ai-model/decode/wer* files for WER and exp/api.ai-model/decode/log/ files for decoding output. + +### Online Decoder: +See http://kaldi.sourceforge.net/online_decoding.html for more information about kaldi online decoding. + +Run: +```sh +$./local/create-corpus.sh data/test-corpus/ test1.wav test2.wav +``` +If you want WER then edit data/test-corpus/text and replace NO_TRANSCRIPTION with expected text transcription for every wav file. + +Make config file exp/api.ai-model/conf/online.conf with following content: +``` +--feature-type=mfcc +--mfcc-config=exp/api.ai-model/mfcc.conf +``` +Then run: +```sh +$ steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --cmd run.pl --nj 1 exp/api.ai-model/ data/test-corpus/ exp/api.ai-model/decode/ +``` +See exp/api.ai-model/decode/wer* files for WER and exp/api.ai-model/decode/log/ files for decoding output. \ No newline at end of file diff --git a/egs/apiai_decode/s5/download-model.sh b/egs/apiai_decode/s5/download-model.sh new file mode 100755 index 00000000000..0847c3fb914 --- /dev/null +++ b/egs/apiai_decode/s5/download-model.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Downlaods Api.ai chain model into exp/api.ai-model/ (will replace one if exists) + +DOWNLOAD_URL="https://api.ai/downloads/api.ai-kaldi-asr-model.zip" + +echo "Downloading model" +wget -N $DOWNLOAD_URL || ( echo "Unable to download model: $DOWNLOAD_URL" && exit 1 ); + +echo "Unpacking model" +unzip api.ai-kaldi-asr-model.zip || ( echo "Unable to extract api.ai-kaldi-asr-model.zip" && exit 1 ); + +echo "Moving model to exp/api.ai-model/" +if [ ! -d exp ]; then + mkdir exp; +fi; + +if [ -d exp/api.ai-model ]; then + echo "Found existing model, removing"; + rm -rf exp/api.ai-model/ +fi + +mv api.ai-kaldi-asr-model exp/api.ai-model || ( echo "Unable to move model to exp/" && exit 1 ) + +echo "Model is ready to use use recognize-wav.sh to do voice recognition" diff --git a/egs/apiai_decode/s5/local/create-corpus.sh b/egs/apiai_decode/s5/local/create-corpus.sh new file mode 100755 index 00000000000..a101128b4ac --- /dev/null +++ b/egs/apiai_decode/s5/local/create-corpus.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Checking arguments +if [ $# -le 1 ]; then + echo "Use $0 test1.wav [test2.wav] ..." 
+ echo " $0 data/test-corpus test1.wav test2.wav" + exit 0; +fi + +CORPUS=$1 +shift +for file in "$@"; do + if [[ "$file" != *.wav ]]; then + echo "Expecting .wav files, got $file" + exit 1; + fi + + if [ ! -f "$file" ]; then + echo "$file not found"; + exit 1; + fi +done; + + +echo "Initilizing $CORPUS" +if [ ! -d "$CORPUS" ]; then + echo "Creating $CORPUS directory" + mkdir -p "$CORPUS" || ( echo "Unable to create data dir" && exit 1 ) +fi; + +wav_scp="$CORPUS/wav.scp" +spk2utt="$CORPUS/spk2utt" +utt2spk="$CORPUS/utt2spk" +text="$CORPUS/text" + +#nulling files +cat $wav_scp +cat $spk2utt +cat $utt2spk +cat $text +rm $CORPUS/feats.scp 2>/dev/null; +rm $CORPUS/cmvn.scp 2>/dev/null; + +for file in "$@"; do + id=$(echo $file | sed -e 's/ /_/g') + echo "$id $file" >>$wav_scp + echo "$id $id" >>$spk2utt + echo "$id $id" >>$utt2spk + echo "$id NO_TRANSRIPTION" >>$text +done; diff --git a/egs/apiai_decode/s5/local/score.sh b/egs/apiai_decode/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/apiai_decode/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/apiai_decode/s5/path.sh b/egs/apiai_decode/s5/path.sh new file mode 100755 index 00000000000..8b177b18ab2 --- /dev/null +++ b/egs/apiai_decode/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/src/path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/apiai_decode/s5/recognize-wav.sh b/egs/apiai_decode/s5/recognize-wav.sh new file mode 100755 index 00000000000..cba3e70a4fc --- /dev/null +++ b/egs/apiai_decode/s5/recognize-wav.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 + +# This script demonstrates kaldi decoding using pretrained model. It will decode list of wav files. +# +# IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format. +# +# This script tries to follow with what other scripts are doing in terms of directory structures and data handling. +# +# Use ./download-model.sh script to download asr model +# See https://github.com/api-ai/api-ai-english-asr-model for details about a model and how to use it. + +. path.sh +MODEL_DIR="exp/api.ai-model" +DATA_DIR="data/test-corpus" + +echo "///////" +echo "// IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format." +echo "//////"; + +for file in final.mdl HCLG.fst words.txt frame_subsampling_factor; do + if [ ! 
-f $MODEL_DIR/$file ]; then + echo "$MODEL_DIR/$file not found, use ./download-model.sh" + exit 1; + fi +done; + +for app in nnet3-latgen-faster apply-cmvn lattice-scale; do + command -v $app >/dev/null 2>&1 || { echo >&2 "$app not found, is kaldi compiled?"; exit 1; } +done; + +local/create-corpus.sh $DATA_DIR $@ || exit 1; + +echo "///////" +echo "// Computing mfcc and cmvn (cmvn is not really used)" +echo "//////"; + + steps/make_mfcc.sh --nj 1 --mfcc-config $MODEL_DIR/mfcc.conf \ + --cmd "run.pl" $DATA_DIR exp/make_mfcc exp/mfcc || { echo "Unable to calculate mfcc, ensure 16kHz, 16 bit little-endian wav format or see log"; exit 1; }; + steps/compute_cmvn_stats.sh $DATA_DIR exp/make_mfcc/ exp/mfcc || exit 1; + +echo "///////" +echo "// Doing decoding (see log for results)" +echo "//////"; +frame_subsampling_factor=$(cat $MODEL_DIR/frame_subsampling_factor) +nnet3-latgen-faster --frame-subsampling-factor=$frame_subsampling_factor --frames-per-chunk=50 --extra-left-context=0 \ + --extra-right-context=0 --extra-left-context-initial=-1 --extra-right-context-final=-1 \ + --minimize=false --max-active=7000 --min-active=200 --beam=15.0 --lattice-beam=8.0 \ + --acoustic-scale=1.0 --allow-partial=true \ + --word-symbol-table=$MODEL_DIR/words.txt $MODEL_DIR/final.mdl $MODEL_DIR//HCLG.fst \ + "ark,s,cs:apply-cmvn --norm-means=false --norm-vars=false --utt2spk=ark:$DATA_DIR/utt2spk scp:$DATA_DIR/cmvn.scp scp:$DATA_DIR/feats.scp ark:- |" \ + "ark:|lattice-scale --acoustic-scale=10.0 ark:- ark:- >exp/lat.1" \ No newline at end of file diff --git a/egs/apiai_decode/s5/steps b/egs/apiai_decode/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/apiai_decode/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/apiai_decode/s5/utils b/egs/apiai_decode/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/apiai_decode/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/aspire/s5/local/fisher_create_test_lang.sh b/egs/aspire/s5/local/fisher_create_test_lang.sh index e17d95c4b47..924f6e6c4ba 100755 --- a/egs/aspire/s5/local/fisher_create_test_lang.sh +++ b/egs/aspire/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -10,26 +10,12 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz cp -rT data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
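The deleted comment above describes the old hand-built G.fst pipeline (OOV filtering, eps2disambig/s2eps, explicit fstcompile); after this patch all of that is handled internally by a single `arpa2fst` call, which appears interleaved with the deletions just below. For readability, the resulting build reads in full:

```sh
# G.fst build as it stands after this patch (same files as before).
gunzip -c "$arpa_lm" | \
  arpa2fst --disambig-symbol=#0 \
           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst   # the first number printed should be small
```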
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst - + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -61,4 +47,3 @@ utils/build_const_arpa_lm.sh \ data/local/lm/4gram-mincount/lm_unpruned.gz data/lang data/lang_test_fg echo "$0 succeeded" - diff --git a/egs/aspire/s5/local/multi_condition/check_version.sh b/egs/aspire/s5/local/multi_condition/check_version.sh index 4c9af0b00cf..81c415a3d67 100755 --- a/egs/aspire/s5/local/multi_condition/check_version.sh +++ b/egs/aspire/s5/local/multi_condition/check_version.sh @@ -1,6 +1,23 @@ #!/bin/bash # Script to check the tool versions necessary for the aspire recipe +function check_for_bad_sox { + if which sox >&/dev/null; then # sox is on the path + sox_version=$(sox --version | awk -F 'v' '{print $2}' | awk -F '.' '{print $1 "." $2}') + if [ "$sox_version" == "14.2" ] || [ "$sox_version" == "14.3" ]; then + echo "*** WARNING: your version of sox is either 14.2.x or 14.3.x ***" + echo "*** which may cause errors in the data preparation of this recipe. ***" + echo "*** Please upgrade your sox to version 14.4 or higher. ***" + exit 1; + fi + else + echo "*** This recipe requires sox for the data preparation. 
***" + exit 1; + fi +} + +check_for_bad_sox; + python -c " from distutils.version import LooseVersion import warnings, sys diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index a7e8f82159c..ca73a447c83 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -112,6 +112,11 @@ fi # copying the noise-rir pairing files cp ${output_dir}_non_normalized/info/* $output_dir/info +# rename file location in the noise-rir pairing files +for file in `ls $output_dir/info/noise_impulse*`; do + sed -i "s/_non_normalized//g" $file +done + # generating the rir-list with probabilities alloted for each rir db_string_python=$(echo $db_string|sed -e "s/'\s\+'/','/g") python -c " diff --git a/egs/aspire/s5/local/multi_condition/read_rir.py b/egs/aspire/s5/local/multi_condition/read_rir.py index 1229e508d2a..e2510ac7d61 100755 --- a/egs/aspire/s5/local/multi_condition/read_rir.py +++ b/egs/aspire/s5/local/multi_condition/read_rir.py @@ -13,14 +13,14 @@ def read_raw(input_filename, precision = np.float32): def wav_write(file_handle, fs, data): if str(data.dtype) in set(['float64', 'float32']): - data = (0.99 * data / np.max(np.abs(data))) * (2 ** 31) - data = data.astype('int32', copy = False) - elif str(data.dtype) == 'int32': + data = (0.99 * data / np.max(np.abs(data))) * (2 ** 15) + data = data.astype('int16', copy = False) + elif str(data.dtype) == 'int16': pass else: raise Exception('Not implemented for '+str(data.dtype)) scipy.io.wavfile.write(file_handle, fs, data) - + def usage(): return """This is a python script to read impulse responses stored in custom formats. It handles AIR database.""" diff --git a/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh b/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh index da0dfb90def..add00c3c5af 100755 --- a/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh +++ b/egs/aspire/s5/local/multi_condition/reverberate_data_dir.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2014 Johns Hopkins University (Author: Vijayaditya Peddinti) +# 2015 Tom Ko # Apache 2.0. # This script processes generates multi-condition training data from clean data dir # and directory with impulse responses and noises @@ -9,11 +10,8 @@ set -e random_seed=0 -num_files_per_job=100 snrs="20:10:15:5:0" log_dir=exp/make_reverb -max_jobs_run=50 -dest_wav_dir= . ./path.sh; . 
./utils/parse_options.sh @@ -29,13 +27,8 @@ src_dir=$1 impnoise_dir=$2 dest_dir=$3 -if [ -z $dest_wav_dir ]; then - dest_wav_dir=$dest_dir/wavs -fi - mkdir -p $dest_dir mkdir -p $log_dir -mkdir -p $dest_wav_dir wav_prefix="rev${random_seed}_" utt_prefix="rev${random_seed}_" @@ -48,17 +41,9 @@ cat $src_dir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' # create the wav.scp files cat $src_dir/wav.scp | sed -e "s/^\s*//g" | \ cut -d' ' -f1 | \ - awk -v p1=$dest_wav_dir -v p2=$wav_prefix \ - '{printf("%s%s%s.wav\n", p1, p2, $1);}'> $log_dir/corrupted_${random_seed}.list - -python -c " -import re -file_ids = map(lambda x: x.split()[0], open('$src_dir/wav.scp').readlines()) -dest_file_names = map(lambda x: x.split()[0], open('$log_dir/corrupted_${random_seed}.list')) -for file_id, dest_file_name in zip(file_ids, dest_file_names): - print '$wav_prefix{0} cat {1} |'.format(file_id, dest_file_name) -" > $dest_dir/wav.scp - + awk -v p2=$wav_prefix \ + '{printf("%s%s\n", p2, $1);}'> $log_dir/corrupted_${random_seed}.list + # modify segments file to point to the new wav files cat $dest_dir/segments | awk -v p=$wav_prefix \ '{printf("%s %s%s %s %s\n", $1, p, $2, $3, $4);}' > $log_dir/segments_temp @@ -71,13 +56,9 @@ for file in cmvn.scp feats.scp reco2file_and_channel; do rm -f $dest_dir/$file done -python local/multi_condition/get_reverberate_parameter_lists.py \ - --snrs $snrs --num-files-per-job $num_files_per_job --random-seed $random_seed \ +python local/multi_condition/reverberate_wavs.py \ + --snrs $snrs --random-seed $random_seed \ $src_dir/wav.scp $log_dir/corrupted_${random_seed}.list $impnoise_dir \ -$log_dir/corrupt_wavs.${random_seed}.list > $log_dir/num_corruption_jobs || exit 1; - -num_jobs=$(cat $log_dir/num_corruption_jobs) -$decode_cmd -V --max-jobs-run $max_jobs_run JOB=1:$num_jobs $log_dir/corrupt_wavs.${random_seed}.JOB.log \ - python local/multi_condition/corrupt.py --temp-file-name $log_dir/temp_JOB.wav $log_dir/corrupt_wavs.${random_seed}.JOB.list || exit 1; +$dest_dir/wav.scp || exit 1; echo "Successfully generated corrupted data and stored it in $dest_dir." && exit 0; diff --git a/egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py similarity index 56% rename from egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py rename to egs/aspire/s5/local/multi_condition/reverberate_wavs.py index caa39690b23..998a3ed5e74 100755 --- a/egs/aspire/s5/local/multi_condition/get_reverberate_parameter_lists.py +++ b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # Copyright 2014 Johns Hopkins University (Authors: Vijayaditya Peddinti). Apache 2.0. 
+# 2015 Tom Ko # script to generate multicondition training data / dev data / test data import argparse, glob, math, os, random, scipy.io.wavfile, sys @@ -23,18 +24,9 @@ def return_nonempty_lines(lines): return new_lines -def exists_wavfile(file_name): - return os.path.isfile(file_name) - try: - scipy.io.wavfile.read(file_name) - return True - except IOError: - return False - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--snrs', type=str, default = '20:10:0', help='snrs to be used for corruption') - parser.add_argument('--num-files-per-job', type=int, default = None, help='number of commands to be stored in each file') parser.add_argument('--check-output-exists', type = str, default = 'True', help = 'process file only if output file does not exist', choices = ['True', 'true', 'False', 'false']) parser.add_argument('--random-seed', type = int, default = 0, help = 'seed to be used in the randomization of impulses') parser.add_argument('wav_file_list', type=str, help='wav.scp file to corrupt') @@ -75,49 +67,32 @@ def exists_wavfile(file_name): raise Exception('Unknown format of ' + file) impulse_noise_index.append([impulses_set, noises_list]) - - if params.num_files_per_job is None: - lines_per_file = len(wav_files) - else: - lines_per_file = params.num_files_per_job - num_parts = int(math.ceil(len(wav_files)/ float(lines_per_file))) - indices_per_file = map(lambda x: xrange(lines_per_file * (x-1), lines_per_file * x), range(1, num_parts)) - indices_per_file.append(xrange(lines_per_file * (num_parts-1), len(wav_files))) - - part_counter = 1 - commands_file_base, ext = os.path.splitext(params.output_command_file) - for indices in indices_per_file: - command_list = [] - for i in indices: - wav_file = " ".join(wav_files[i].split()[1:]) - output_wav_file = wav_out_files[i] - impulse_file = impulses.next() - noise_file = '' - snr = '' - found_impulse = False - if add_noise: - for i in xrange(len(impulse_noise_index)): - if impulse_file in impulse_noise_index[i][0]: - noise_file = impulse_noise_index[i][1].next() - snr = snrs.next() - assert(len(wav_file.strip()) > 0) - assert(len(impulse_file.strip()) > 0) - assert(len(noise_file.strip()) > 0) - assert(len(snr.strip()) > 0) - assert(len(output_wav_file.strip()) > 0) - command_list.append("{0} --rir-file {1} --noise-file {2} --snr-db {3} - {4} \n".format(wav_file, impulse_file, noise_file, snr, output_wav_file)) - found_impulse = True - break - if not found_impulse: - assert(len(wav_file.strip()) > 0) - assert(len(impulse_file.strip()) > 0) - assert(len(output_wav_file.strip()) > 0) - command_list.append("{0} --rir-file {1} - {2} \n".format(wav_file, impulse_file, output_wav_file)) - if exists_wavfile(output_wav_file): - # we perform the check at this point to ensure replication of (wavfile, impulse, noise, snr) tuples across runs. 
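# As a rough sketch of what the renamed script now writes out (utterance and file
# names here are invented for illustration): rather than pre-computing corrupted
# wav files on disk, every entry of the destination wav.scp becomes a pipe that
# runs wav-reverberate on the fly, e.g.
#   rev1_sw02001-A cat /data/fisher/sw02001-A.wav | wav-reverberate --noise-file=rwcp_type1_noise1.wav --snr-db=20 - air_type1_rir1.wav - |
# so the corruption is redone by whichever job reads the wav, instead of being
# stored as intermediate audio.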
- command_list.pop() - file_handle = open("{0}.{1}{2}".format(commands_file_base, part_counter, ext), 'w') - part_counter += 1 - file_handle.write("".join(command_list)) - file_handle.close() - print num_parts + command_list = [] + for i in range(len(wav_files)): + wav_file = " ".join(wav_files[i].split()[1:]) + output_wav_file = wav_out_files[i] + impulse_file = impulses.next() + noise_file = '' + snr = '' + found_impulse = False + if add_noise: + for i in xrange(len(impulse_noise_index)): + if impulse_file in impulse_noise_index[i][0]: + noise_file = impulse_noise_index[i][1].next() + snr = snrs.next() + assert(len(wav_file.strip()) > 0) + assert(len(impulse_file.strip()) > 0) + assert(len(noise_file.strip()) > 0) + assert(len(snr.strip()) > 0) + assert(len(output_wav_file.strip()) > 0) + command_list.append("{4} {0} wav-reverberate --noise-file={2} --snr-db={3} - {1} - |\n".format(wav_file, impulse_file, noise_file, snr, output_wav_file)) + found_impulse = True + break + if not found_impulse: + assert(len(wav_file.strip()) > 0) + assert(len(impulse_file.strip()) > 0) + assert(len(output_wav_file.strip()) > 0) + command_list.append("{2} {0} wav-reverberate - {1} - |\n".format(wav_file, impulse_file, output_wav_file)) + file_handle = open(params.output_command_file, 'w') + file_handle.write("".join(command_list)) + file_handle.close() diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh index 341122f73d0..f8a45c3e790 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_aalto.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=AALTO file_splitter= #script to generate job scripts given the command file @@ -62,9 +63,9 @@ tmpdir=`readlink -e $tmpdir` file_count=1 for data_file in ${data_files[@]}; do # aalto has incompatible format of wav audio, which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
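# With the new output_bit=16 default, the conversion written below amounts to
# something like the following (the input path and the 8k rate are assumed, just
# for illustration):
#   sox -t wav aalto_rir_orig.wav -t wav -r 8k -e signed-integer -b 16 data/impulses_noises/aalto_type1_rir.wav
# i.e. resample to $sampling_rate and rewrite as 16-bit signed PCM, matching the
# int16 output now produced by local/multi_condition/read_rir.py.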
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file +# output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file # echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${tmpdir}/$file_count.wav ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh index 9cb7e4fae1d..c7b6300db50 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_air.sh @@ -51,7 +51,8 @@ command_file=$log_dir/${DBname}_read_rir_noise.sh echo "">$command_file file_count=1 while read file_pattern output_file_name; do - output_file_name=`echo ${DBname}_type${type_num}_${file_count}_$output_file_name| tr '[:upper:]' '[:lower:]'` + # output_file_name=`echo ${DBname}_type${type_num}_${file_count}_$output_file_name| tr '[:upper:]' '[:lower:]'` + output_file_name=`echo ${DBname}_type${type_num}_$output_file_name| tr '[:upper:]' '[:lower:]'` echo "local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate air '${file_pattern}' ${output_dir}/${output_file_name} || exit 1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type$type_num.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh index 513e0c481d7..8e5dd34d9ac 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_c4dm.sh @@ -8,6 +8,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=C4DM file_splitter= #script to generate job scripts given the command file @@ -91,9 +92,9 @@ tmpdir=`readlink -e $tmpdir` file_count=1 for data_file in ${data_files[@]}; do # c4dm has incompatible format of wav audio, which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + # output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file #echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${tmpdir}/${file_count}.wav ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh index f8c2610293f..4690b9b1861 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_mardy.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=MARDY file_splitter= #script to generate job scripts given the command file @@ -47,8 +48,9 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${RIR_home}/mardy/" file_count=1 for data_file in ${data_files[@]}; do - output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + #output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file #echo "python local/multi_condition/read_rir.py --output-sampling-rate $sampling_rate wav ${data_file} ${output_dir}/${output_file_name} || exit -1;" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh index 71ec52d0d49..bd43da77079 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_openair.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=OPENAIR file_splitter= #script to generate job scripts given the command file @@ -432,13 +433,14 @@ echo "Found $total_files impulse responses in ${RIR_home}/open_air/" file_count=1 # affix to ensure that files with same name are not overwritten for data_file in ${data_files[@]}; do # open-air has multiple formats of wav audio, some of which are not compatible with python's wav.read() function - # so we convert everything to 32bit PCM. 
- output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file +# output_file_name=${DBname}_type${type_num}_${file_count}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list file_count=$((file_count + 1)) done + if [ ! -z "$file_splitter" ]; then num_jobs=$($file_splitter $command_file || exit 1) job_file=${command_file%.sh}.JOB.sh diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh index 0124038d1b0..32394556f01 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_rvb2014.sh @@ -8,6 +8,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=RVB2014 file_splitter= #script to generate job scripts given the command file @@ -57,7 +58,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${Reverb2014_home1}/RIR." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file | tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list files_done=$((files_done + 1)) done @@ -69,7 +70,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.noise.list echo "Found $total_files noises in ${Reverb2014_home1}/NOISE." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.noise.list files_done=$((files_done + 1)) done @@ -83,7 +84,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.rir.list echo "Found $total_files impulse responses in ${Reverb2014_home2}/RIR." for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file| tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.rir.list files_done=$((files_done + 1)) done @@ -96,7 +97,7 @@ echo "" > $log_dir/${DBname}_type${type_num}.noise.list echo "Found $total_files noises in ${Reverb2014_home2}/NOISE." 
for data_file in ${data_files[@]}; do output_file_name=${DBname}_type${type_num}_`basename $data_file | tr '[:upper:]' '[:lower:]'` - echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $data_file -t wav -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/${DBname}_type${type_num}.noise.list files_done=$((files_done + 1)) done diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh index aac8efcd340..b44669b86f1 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_rwcp.sh @@ -14,6 +14,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=RWCP file_splitter= #script to generate job scripts given the command file @@ -73,7 +74,7 @@ for base_dir_name in ${RWCP_dirs[@]}; do for i in `seq $first_channel $last_channel`; do channel_files="$channel_files -t raw -e float -b 32 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i "; done - echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list files_done=$((files_done + 1)) done @@ -105,7 +106,7 @@ for data_file in ${data_files[@]}; do temp_file=$tempdir_robo/$files_done.wav python $tempdir_robo/raw_read.py $data_file $temp_file output_file_name=RWCP_type${type_num}_rir_`basename $data_file .dat | tr '[:upper:]' '[:lower:]'`.wav - echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -t wav $temp_file -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.rir.list files_done=$((files_done + 1)) done @@ -128,7 +129,7 @@ for leaf_dir_name in ${leaf_directories[@]}; do for i in `seq $first_channel $last_channel`; do channel_files="$channel_files -t raw -e signed-integer -b 16 -c 1 -r 48k $leaf_dir_name/$file_base_name.$i "; done - echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b 32 ${output_dir}/${output_file_name}" >> $command_file + echo "sox -M $channel_files -r $sampling_rate -e signed-integer -b $output_bit ${output_dir}/${output_file_name}" >> $command_file echo ${output_dir}/${output_file_name} >> $log_dir/RWCP_type$type_num.noise.list files_done=$((files_done + 1)) diff --git a/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh b/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh index 6a0df08eb8a..4be2b1779f3 100755 --- a/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh +++ b/egs/aspire/s5/local/multi_condition/rirs/prep_varechoic.sh @@ -7,6 +7,7 @@ download=true sampling_rate=8k +output_bit=16 DBname=VARECHOIC file_splitter= #script to generate job scripts given the command file @@ -47,7 +48,7 @@ varechoic_home=$RIR_home/icsi_varechoic/varechoic for room_type in ir00 ir43 ir100 ; do for mike in m1 m2 m3 m4; do file_basename=${room_type}${mike} - echo "sox -B -e float -b 32 -c 1 -r 8k -t raw $varechoic_home/${file_basename}.raw -t wav -b 32 $output_dir/${DBname}_${file_basename}.wav" >> $command_file + echo "sox 
-B -e float -b 32 -c 1 -r 8k -t raw $varechoic_home/${file_basename}.raw -t wav -b $output_bit $output_dir/${DBname}_${file_basename}.wav" >> $command_file echo $output_dir/${DBname}_${file_basename}.wav >> $log_dir/${DBname}_type$type_num.rir.list done done diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh index 11224d8e841..5b6424a1d86 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_common.sh @@ -8,12 +8,11 @@ stage=1 snrs="20:10:15:5:0" num_data_reps=3 -dest_wav_dir=data/rvb_wavs # directory to store the reverberated wav files ali_dir=exp/ db_string="'air' 'rwcp' 'rvb2014'" # RIR dbs to be used in the experiment # only dbs used for ASpIRE submission system have been used here RIR_home=db/RIR_databases/ # parent directory of the RIR databases files -download_rirs=false # download the RIR databases from the urls or assume they are present in the RIR_home directory +download_rirs=true # download the RIR databases from the urls or assume they are present in the RIR_home directory set -e . cmd.sh @@ -40,17 +39,15 @@ if [ $stage -le 1 ]; then else num_reps=1 fi - mkdir -p data/${data_dir}_rvb/wavs reverb_data_dirs= for i in `seq 1 $num_reps`; do cur_dest_dir=" data/temp_${data_dir}_${i}" - local/multi_condition/reverberate_data_dir.sh --random-seed $i --log-dir exp/make_reverb/log \ - --dest-wav-dir ${dest_wav_dir}/wavs${i}/ \ + local/multi_condition/reverberate_data_dir.sh --random-seed $i \ --snrs "$snrs" --log-dir exp/make_corrupted_wav \ data/${data_dir} data/impulses_noises $cur_dest_dir reverb_data_dirs+=" $cur_dest_dir" done - utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb_hires $reverb_data_dirs + utils/combine_data.sh --extra-files utt2uniq data/${data_dir}_rvb $reverb_data_dirs rm -rf $reverb_data_dirs done @@ -63,7 +60,7 @@ if [ $stage -le 1 ]; then local/multi_condition/copy_ali_dir.sh --utt-prefix "rev${i}_" exp/tri5a exp/tri5a_temp_$i || exit 1; ali_dirs+=" exp/tri5a_temp_$i" done - local/multi_condition/combine_ali_dirs.sh --ref-data-dir data/train_rvb_hires \ + local/multi_condition/combine_ali_dirs.sh --ref-data-dir data/train_rvb \ exp/tri5a_rvb_ali $ali_dirs || exit 1; # copy the alignments for training the 100k system (from tri4a) diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 8119ff44661..4be5efe25ec 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -13,7 +13,6 @@ stage=1 train_stage=-10 use_gpu=true dir=exp/nnet2_multicondition/nnet_ms_a -dest_wav_dir=data/rvb_wavs # directory to store the reverberated wav files set -e . cmd.sh @@ -52,7 +51,7 @@ else fi # do the common parts of the script. -local/multi_condition/run_nnet2_common.sh --dest-wav-dir $dest_wav_dir --stage $stage +local/multi_condition/run_nnet2_common.sh --stage $stage if [ $stage -le 7 ]; then diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh index 9fcf134ccce..ad5fba0929f 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh @@ -85,7 +85,7 @@ if [ $stage -le 2 ]; then # hardcode no-GPU for alignment, although you could use GPU [you wouldn't # get excellent GPU utilization though.] 
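# A minimal sketch of the throttling option used in the hunk below (the job name
# and binary are placeholders): utils/queue.pl accepts --max-jobs-run directly,
# e.g.
#   $decode_cmd --max-jobs-run 200 JOB=1:1500 exp/foo/log/align.JOB.log \
#     some-alignment-binary args...
# whereas the old "-tc" flag was a GridEngine-specific option that had to be
# passed through to qsub/qalter.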
nj=1500 # this is 6k hours, use more jobs and control the speed dynamically using - # throttle control option (-tc with qalter) + # throttle control option (--max-jobs-run with qalter) # have a high number of jobs because this could take a while, and we might # have some stragglers. max_jobs_run=200 @@ -110,14 +110,14 @@ if [ $stage -le 3 ]; then if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi steps/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc $max_jobs" \ + --cmd "$decode_cmd --max-jobs-run $max_jobs" \ --online-ivector-dir exp/nnet2_multicondition/ivectors_train \ --criterion $criterion --drop-frames $drop_frames \ data/train_rvb_hires data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1; # the command below is a more generic, but slower, way to do it. #steps/online/nnet2/get_egs_discriminative2.sh \ - # --cmd "$decode_cmd -tc $max_jobs" \ + # --cmd "$decode_cmd --max-jobs-run $max_jobs" \ # --criterion $criterion --drop-frames $drop_frames \ # data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1; fi diff --git a/egs/aspire/s5/local/nnet3/run_autoencoder.sh b/egs/aspire/s5/local/nnet3/run_autoencoder.sh new file mode 100644 index 00000000000..abc7f3a6234 --- /dev/null +++ b/egs/aspire/s5/local/nnet3/run_autoencoder.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# this is an example to show a "tdnn" system in raw nnet configuration +# i.e. without a transition model + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +num_data_reps=10 + +remove_egs=true + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $targets_scp +done + +if [ $stage -le 9 ]; then + echo "$0: creating neural net configs"; + + num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,3 -7,2 0" \ + --feat-dir ${data_dir} \ + --relu-dim=1024 \ + --add-lda=false \ + --objective-type=quadratic \ + --add-final-sigmoid=false \ + --include-log-softmax=false \ + --use-presoftmax-prior-scale=false \ + --num-targets=$num_targets \ + $dir/configs || exit 1; +fi + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/tdnn/train_raw_nnet.sh --stage $train_stage \ + --cmd "$decode_cmd" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --num-epochs 2 \ + --num-jobs-initial 3 \ + --num-jobs-final 16 \ + --initial-effective-lrate 0.0017 \ + --final-effective-lrate 0.00017 \ + --egs-dir "$common_egs_dir" \ + --remove-egs $remove_egs \ + --use-gpu true \ + --dense-targets true \ + ${data_dir} $targets_scp $dir || exit 1 +fi + diff --git a/egs/aspire/s5/path.sh b/egs/aspire/s5/path.sh index e93eb33f24b..1a6fb5f891b 100755 --- a/egs/aspire/s5/path.sh +++ b/egs/aspire/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. 
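# A brief note on the change below (the description of common_path.sh is an
# assumption based on the standard Kaldi layout, not something stated in this
# patch): tools/config/common_path.sh appends the various $KALDI_ROOT/src/*bin
# directories to PATH, so the new two-line path.sh
#   export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
#   . $KALDI_ROOT/tools/config/common_path.sh
# covers the same binaries as the long hand-maintained export it replaces.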
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/aurora4/s5/local/aurora4_format_data.sh b/egs/aurora4/s5/local/aurora4_format_data.sh index 4208c019879..0b94f7f796d 100755 --- a/egs/aurora4/s5/local/aurora4_format_data.sh +++ b/egs/aurora4/s5/local/aurora4_format_data.sh @@ -21,7 +21,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt mkdir -p $tmpdir -for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do +for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -42,23 +42,9 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang/* $test gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/aurora4/s5/path.sh b/egs/aurora4/s5/path.sh index fee0b9b0c11..2d17b17a84a 100755 --- a/egs/aurora4/s5/path.sh +++ b/egs/aurora4/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5/local/arpa2G.sh b/egs/babel/s5/local/arpa2G.sh index 67d44080fe4..f037caf0d7b 100755 --- a/egs/babel/s5/local/arpa2G.sh +++ b/egs/babel/s5/local/arpa2G.sh @@ -39,14 +39,8 @@ destdir=$3 mkdir $destdir 2>/dev/null || true gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true exit 0 diff --git a/egs/babel/s5/local/arpa2G_syllables.sh b/egs/babel/s5/local/arpa2G_syllables.sh index 8f10f87f019..58ef162ec2e 100755 --- a/egs/babel/s5/local/arpa2G_syllables.sh +++ b/egs/babel/s5/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. 
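# The recurring pattern in this patch is the single-step ARPA-to-FST compilation
# (a sketch with generic file names):
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 --read-symbol-table=words.txt - G.fst
# arpa2fst now maps <s>/</s> to epsilon and inserts the #0 backoff symbol itself,
# so the old fstprint | eps2disambig.pl | s2eps.pl | fstcompile | fstrmepsilon
# pipeline (removed in the hunks below) is no longer needed.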
cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh index 1837902a7d0..a5601130343 100755 --- a/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . 
./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/local/arpa2G.sh b/egs/babel/s5b/local/arpa2G.sh index 83f789e999f..db816abc7a5 100755 --- a/egs/babel/s5b/local/arpa2G.sh +++ b/egs/babel/s5b/local/arpa2G.sh @@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then fi set -e #Exit on non-zero return code from any command -set -o pipefail #Exit if any of the commands in the pipeline will +set -o pipefail #Exit if any of the commands in the pipeline will #return non-zero return code lmfile=$1 @@ -58,7 +58,7 @@ if [ ! 
-z "$oov_prob_file" ]; then exit 1; fi - min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; + min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; while() { if (m/\\(\d)-grams:/) { $order = $1; } if ($order == 1) { @A = split; if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob') @@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then while() { if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; } else { print; } # print all lines unchanged except the one that says ngram 1=X. - if (m/^\\1-grams:$/) { + if (m/^\\1-grams:$/) { foreach $l (@OOVS) { @A = split(" ", $l); @A == 2 || die "bad line in oov2prob: $_;"; @@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then else decompress="cat $lmfile" fi - + $decompress | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 + fstisstochastic $destdir/G.fst || true; if $cleanup; then diff --git a/egs/babel/s5b/local/arpa2G_syllables.sh b/egs/babel/s5b/local/arpa2G_syllables.sh index 8147a6bb38b..58ef162ec2e 100755 --- a/egs/babel/s5b/local/arpa2G_syllables.sh +++ b/egs/babel/s5b/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh b/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh index fcf67514396..760d7ee80d5 100755 --- a/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh +++ b/egs/babel/s5b/local/nnet2/get_egs_semi_supervised.sh @@ -28,7 +28,7 @@ transform_dir_sup= # If supplied, overrides alidir transform_dir_unsup= num_jobs_nnet=16 # Number of neural net jobs to run in parallel stage=-10 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. 
splice_width=4 # meaning +- 4 frames on each side for second LDA spk_vecs_dir_sup= spk_vecs_dir_unsup= diff --git a/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh index 79a1bbd2263..79bd348bf75 100755 --- a/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5b/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . ./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf b/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf index 8b09764e45a..13bac7586a1 100644 --- a/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf +++ b/egs/babel/s5c/conf/lang/107-vietnamese-fullLP.official.conf @@ -3,7 +3,7 @@ #speech corpora files location train_data_dir=/export/babel/data/107-vietnamese/release-current/conversational/training/ -train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.fullLP.list +train_data_list=/export/babel/data/splits/Vietnamese_Babel107/train.FullLP.list train_nj=32 #RADICAL DEV data files diff --git a/egs/babel/s5c/local/arpa2G.sh b/egs/babel/s5c/local/arpa2G.sh index 83f789e999f..db816abc7a5 100755 --- a/egs/babel/s5c/local/arpa2G.sh +++ b/egs/babel/s5c/local/arpa2G.sh @@ -38,7 +38,7 @@ if [ $# -ne 3 ]; then fi set -e #Exit on non-zero return code from any command -set -o pipefail #Exit if any of the commands in the pipeline will +set -o pipefail #Exit if any of the commands in the pipeline will #return non-zero return code lmfile=$1 @@ -58,7 +58,7 @@ if [ ! -z "$oov_prob_file" ]; then exit 1; fi - min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; + min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0; while() { if (m/\\(\d)-grams:/) { $order = $1; } if ($order == 1) { @A = split; if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob') @@ -75,7 +75,7 @@ if [ ! -z "$oov_prob_file" ]; then while() { if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; } else { print; } # print all lines unchanged except the one that says ngram 1=X. 
- if (m/^\\1-grams:$/) { + if (m/^\\1-grams:$/) { foreach $l (@OOVS) { @A = split(" ", $l); @A == 2 || die "bad line in oov2prob: $_;"; @@ -96,16 +96,11 @@ elif [[ $lmfile == *.gz ]] ; then else decompress="cat $lmfile" fi - + $decompress | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1 + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$langdir/words.txt - $destdir/G.fst || exit 1 + fstisstochastic $destdir/G.fst || true; if $cleanup; then diff --git a/egs/babel/s5c/local/arpa2G_syllables.sh b/egs/babel/s5c/local/arpa2G_syllables.sh index 8f10f87f019..58ef162ec2e 100755 --- a/egs/babel/s5c/local/arpa2G_syllables.sh +++ b/egs/babel/s5c/local/arpa2G_syllables.sh @@ -22,7 +22,7 @@ silence_id=`grep -w SIL $langdir/words.txt | awk '{print $2}'` || exit 1; [ -z $silence_id ] && echo Error getting silence-id from $langdir/words.txt && exit 1; rho=$[$last_id+1] -# state 0 is start-state. state 1 is state after we saw silence. state 2 is +# state 0 is start-state. state 1 is state after we saw silence. state 2 is # "dead state/failure state" that is not coaccessible. cat < $destdir/rho.fst 0 1 $silence_id $silence_id @@ -35,16 +35,11 @@ EOF gunzip -c $lmfile | \ - grep -v ' ' | grep -v ' ' | grep -v ' ' | \ sed 's///g' | \ - arpa2fst - | \ - fstprint | \ - utils/eps2disambig.pl | \ - utils/s2eps.pl | \ - fstcompile --isymbols=$langdir/words.txt \ - --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \ + arpa2fst --disambig-symbol=#0 --ilabel-sort=false \ + --read-symbol-table=$langdir/words.txt - | \ fstrhocompose "$rho" - $destdir/rho.fst | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 + fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1 fstisstochastic $destdir/G.fst || true diff --git a/egs/babel/s5c/local/datasets/extra_kws.sh b/egs/babel/s5c/local/datasets/extra_kws.sh index a84ebc7deb1..cb90968a1dc 100644 --- a/egs/babel/s5c/local/datasets/extra_kws.sh +++ b/egs/babel/s5c/local/datasets/extra_kws.sh @@ -60,7 +60,7 @@ function setup_oov_search { #instead of search collection dependent if [ ! -f exp/conf_matrix/.done ] ; then local/generate_confusion_matrix.sh --cmd "$decode_cmd" --nj $my_nj \ - exp/sgmm5_denlats/dengraph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix + exp/sgmm5_denlats/dengraph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix || return 1 touch exp/conf_matrix/.done fi confusion=exp/conf_matrix/confusions.txt diff --git a/egs/babel/s5c/local/extend_lexicon.sh b/egs/babel/s5c/local/extend_lexicon.sh index 18c69415ed4..fd0b27a4172 100755 --- a/egs/babel/s5c/local/extend_lexicon.sh +++ b/egs/babel/s5c/local/extend_lexicon.sh @@ -2,6 +2,7 @@ # Copyright 2014 Johns Hopkins University (authors: Daniel Povey, Yenda Trmal) # 2014 Guoguo Chen +# 2015 MIT Lincoln Labs (author: Fred Richardson) # Apache 2.0. # This script takes an input lexicon (e.g. 
lexicon.txt) and generates likely @@ -351,7 +352,17 @@ if [ $stage -le $g2p_iters ]; then g2p.py -V $var_mass --variants-number $var_counts --encoding $encoding \ --model $dir/p2g.model.final --apply - \ \> $dir/p2g_output.JOB || exit 1; - cat $dir/p2g_output.* > $dir/p2g_output + perl -wlne 'use strict; + our %P; + my ($prn,$num,$prb,$spl)=m/^(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/; + my $tok=$prn."=".$spl; + $P{$tok} = [ $num, $prb ] unless (defined($P{$tok}) && $P{$tok}[1] < $prb); + END { + map{ my ($prn,$spl)=m/^(.*)=(.*)$/; + my ($num, $prb) = @{$P{$tok}}; + print join("\t",$prn,$num,$prb,$spl) + } sort keys %P + }' $dir/p2g_output.* > $dir/p2g_output rm $dir/p2g_output.* fi diff --git a/egs/babel/s5c/local/generate_confusion_matrix.sh b/egs/babel/s5c/local/generate_confusion_matrix.sh index 6529057db9e..4bcbacb5ae9 100755 --- a/egs/babel/s5c/local/generate_confusion_matrix.sh +++ b/egs/babel/s5c/local/generate_confusion_matrix.sh @@ -37,6 +37,7 @@ fi set -u set -e +set -o pipefail data=$1; shift modeldir=$1; shift @@ -64,7 +65,7 @@ cat $data/phones.txt | sed 's/_[B|E|I|S]//g' |\ echo "Converting alignments to phone sequences..." $cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \ - compute-wer --text --mode=all\ + align-text\ ark:\<\( \ ali-to-phones $model ark:"gunzip -c $alidir/ali.JOB.gz|" ark,t:- \|\ int2sym.pl -f 2- $wdir/phones.txt - \) \ @@ -72,7 +73,7 @@ $cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \ lattice-to-phone-lattice $model ark:"gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ lattice-best-path --acoustic-scale=$acwt ark:- ark,t:- ark:/dev/null \| \ int2sym.pl -f 2- $wdir/phones.txt - \) \ - $wdir/confusions.JOB.txt + ark:$wdir/confusions.JOB.txt confusion_files="" for i in `seq 1 $nj` ; do @@ -80,23 +81,12 @@ for i in `seq 1 $nj` ; do done echo "Converting statistics..." -cat $confusion_files | sort | uniq -c | grep -v -E '|||SIL' | \ +cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g'| sort | uniq -c | \ + grep -v -E '|||SIL' | \ perl -ane ' - if ($F[1] eq "correct") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print "$F[2] $F[2] $F[0]\n"; - } elsif ($F[1] eq "deletion" ) { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print "$F[2] $F[0]\n"; - } elsif ($F[1] eq "insertion") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 2); - print " $F[2] $F[0]\n"; - } elsif ($F[1] eq "substitution") { - die "Unknown format " . join(" ", @F) . "\n" if ($#F != 3); - print "$F[2] $F[3] $F[0]\n"; - } else { - die "Unknown line " . join(" ", @F). "\n"; - }' > $wdir/confusions.txt + die unless scalar @F == 3; + print "$F[1] $F[2] $F[0]\n"; + ' > $wdir/confusions.txt exit 0 #-echo "Converting alignments to phone sequences..." diff --git a/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh b/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh index fcf67514396..760d7ee80d5 100755 --- a/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh +++ b/egs/babel/s5c/local/nnet2/get_egs_semi_supervised.sh @@ -28,7 +28,7 @@ transform_dir_sup= # If supplied, overrides alidir transform_dir_unsup= num_jobs_nnet=16 # Number of neural net jobs to run in parallel stage=-10 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. 
splice_width=4 # meaning +- 4 frames on each side for second LDA spk_vecs_dir_sup= spk_vecs_dir_unsup= diff --git a/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh b/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh index d69bf3338f6..79bd348bf75 100755 --- a/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh +++ b/egs/babel/s5c/local/prepare_kaldi_lm_from_training_text.sh @@ -8,7 +8,7 @@ # This script trains LMs on the WSJ LM-training data. # It requires that you have already run wsj_extend_dict.sh, # to get the larger-size dictionary including all of CMUdict -# plus any OOVs and possible acronyms that we could easily +# plus any OOVs and possible acronyms that we could easily # derive pronunciations for. # This script takes as command-line arguments the relevant data/lang @@ -69,7 +69,7 @@ cat $data/text | awk '{for (n=2;n $dir/train_in.gz || exit 1; # Get training data with OOV words (w.r.t. our current vocab) replaced with . -echo "Getting training data with OOV words replaced with (train_nounk.gz)" +echo "Getting training data with OOV words replaced with (train_nounk.gz)" gunzip -c $dir/train_in.gz | awk -v w=$dir/wordlist \ 'BEGIN{while((getline0) v[$1]=1;} {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ @@ -93,7 +93,7 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline< # To save disk space, remove the un-mapped training data. We could # easily generate it again if needed. -rm $dir/train_nounk.gz +rm $dir/train_nounk.gz ################################################################## @@ -177,7 +177,7 @@ prune_lm.sh --arpa 5.0 $dir/4gram # The default LM chosen to be the last pruned 4gram-mincount # # Note: One can cheat and provide an external ARPA LM here!!! -# To do so, make sure that +# To do so, make sure that # -- its vocabulary is fully covered by $lang/words.txt, # -- it is gzipped and # -- it is placed in the $dir directory. @@ -205,14 +205,9 @@ echo "Compiling $gzipped_ARPA_LM into $lang/G.fst" . ./path.sh || exit 1; gunzip -c $gzipped_ARPA_LM | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1; - fstisstochastic $lang/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $lang/G.fst || exit 1; +fstisstochastic $lang/G.fst ################################################################## # Redo the FST step after reviewing perplexities reported by the @@ -220,4 +215,3 @@ gunzip -c $gzipped_ARPA_LM | \ ################################################################## exit 0 - diff --git a/egs/babel/s5c/run-4-anydecode.sh b/egs/babel/s5c/run-4-anydecode.sh index a1b943dd35e..68b87ea1e27 100755 --- a/egs/babel/s5c/run-4-anydecode.sh +++ b/egs/babel/s5c/run-4-anydecode.sh @@ -188,11 +188,11 @@ echo --------------------------------------------------------------------- if [ ! -f $dataset_dir/.done ] ; then if [ "$dataset_kind" == "supervised" ]; then if [ "$dataset_segments" == "seg" ]; then - . ./local/datasets/supervised_seg.sh + . ./local/datasets/supervised_seg.sh || exit 1 elif [ "$dataset_segments" == "uem" ]; then - . ./local/datasets/supervised_uem.sh + . ./local/datasets/supervised_uem.sh || exit 1 elif [ "$dataset_segments" == "pem" ]; then - . 
./local/datasets/supervised_pem.sh + . ./local/datasets/supervised_pem.sh || exit 1 else echo "Unknown type of the dataset: \"$dataset_segments\"!"; echo "Valid dataset types are: seg, uem, pem"; @@ -241,12 +241,12 @@ echo --------------------------------------------------------------------- echo "Preparing kws data files in ${dataset_dir} on" `date` echo --------------------------------------------------------------------- if ! $skip_kws ; then - . ./local/datasets/basic_kws.sh + . ./local/datasets/basic_kws.sh || exit 1 if $extra_kws ; then - . ./local/datasets/extra_kws.sh + . ./local/datasets/extra_kws.sh || exit 1 fi if $vocab_kws ; then - . ./local/datasets/vocab_kws.sh + . ./local/datasets/vocab_kws.sh || exit 1 fi fi diff --git a/egs/bn_music_speech/v1/README b/egs/bn_music_speech/v1/README new file mode 100644 index 00000000000..8a8ae65108d --- /dev/null +++ b/egs/bn_music_speech/v1/README @@ -0,0 +1,6 @@ + The MUSAN corpus is required for system training. It is available at: + http://www.openslr.org/17/ + + The test requires Broadcast News data. The LDC Catalog numbers are: + Speech LDC97S44 + Transcripts LDC97T22 diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/bn_music_speech/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/bn_music_speech/v1/conf/merge_vad_map.txt b/egs/bn_music_speech/v1/conf/merge_vad_map.txt new file mode 100644 index 00000000000..216dee78b65 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/merge_vad_map.txt @@ -0,0 +1,16 @@ +# This table defines the mapping used by the binary merge-vads to +# combine the output of compute-vad and compute-vad-from-frame-likes. +# The first column corresponds to VAD decisions from compute-vad +# and the second corresponds to VAD decisions from +# compute-vad-from-frame-likes. The labels "0" and "1" in the +# first column represent (approximately) silence and nonsilence +# respectively. The labels "0," "1," and "2" in the second column +# represent noise, speech, and music, respectively. The third +# column lists the resulting output labels: "0," "1," and "2" +# corresponding to silence/noise, speech, and music. +0 0 0 +1 0 0 +0 1 0 +1 1 1 +0 2 0 +1 2 2 diff --git a/egs/bn_music_speech/v1/conf/mfcc.conf b/egs/bn_music_speech/v1/conf/mfcc.conf new file mode 100644 index 00000000000..a4be40be454 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/mfcc.conf @@ -0,0 +1,6 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=20 # higher than the default which is 12. 
+--snip-edges=false diff --git a/egs/bn_music_speech/v1/conf/vad.conf b/egs/bn_music_speech/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/bn_music_speech/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/bn_music_speech/v1/local/make_annotations_bn.py b/egs/bn_music_speech/v1/local/make_annotations_bn.py new file mode 100755 index 00000000000..53cebf52ea4 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_annotations_bn.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script creates four files for each HUB4 Broadcast News +# transcript file. The four files are for the music, speech, ad, +# and other transcripts. Each line of the output files define the +# start and end times of the individual events. +# +# This file is meant to be invoked by make_bn.sh. + +import sys, re, os + +def is_speech(line): + if " end: + print "Skipping annotation where end time is before start time:", line + return start, end + +def extract_other_type2(line): + m = re.search('(?<=S_time=)\d+.\d+', line) + start = float(m.group(0)) + m = re.search('(?<=E_time=)\d+.\d+', line) + end = float(m.group(0)) + if start > end: + print "Skipping annotation where end time is before start time:", line + return start, end + +def extract_music(line): + m = re.search('(?<=Time=)\d+.\d+', line) + time = float(m.group(0)) + m = re.search('(?<=Level=)\w', line) + level = m.group(0) + is_on = False + if level == "L" or level == "H": + is_on = True + elif level == "O": + is_on = False + else: + print "Encountered bad token on line:", line + sys.exit() + return time, is_on + +def extract_other_type1(line): + m = re.search('(?<=Time=)\d+.\d+', line) + time = float(m.group(0)) + m = re.search('(?<=Level=)\w', line) + level = m.group(0) + is_on = False + if level == "L" or level == "H": + is_on = True + elif level == "O": + is_on = False + else: + print "Encountered bad token on line:", line + sys.exit() + return time, is_on + +def process_file(annos): + speech = "" + music = "" + other_type2 = "" + other_type1 = "" + start_new_music_segment = True + start_new_other_segment = True + max_time = 0.0 + prev_music_time = "0.0" + prev_other_time = "0.0" + for line in annos: + if is_speech(line): + speech_start, speech_end = extract_speech(line) + speech = speech + str(speech_start) + " " + str(speech_end) + "\n" + max_time = max(speech_end, max_time) + elif is_other_type2(line): + other_type2_start, other_type2_end = extract_other_type2(line) + other_type2 = other_type2 + str(other_type2_start) + " " + str(other_type2_end) + "\n" + max_time = max(other_type2_end, max_time) + elif is_music(line): + time, is_on = extract_music(line) + max_time = max(time, max_time) + if is_on and start_new_music_segment: + prev_music_time = time + start_new_music_segment = False + elif not is_on and not start_new_music_segment: + music = music + str(prev_music_time) + " " + str(time) + "\n" + start_new_music_segment = True + elif is_other_type1(line): + time, is_on = extract_other_type1(line) + max_time = max(time, max_time) + if is_on and start_new_other_segment: + prev_other_time = time + start_new_other_segment = False + elif not is_on and not start_new_other_segment: + other_type1 = other_type1 + str(prev_other_time) + " " + str(time) + "\n" + start_new_other_segment = True + + if not start_new_music_segment: + music = music + str(prev_music_time) + " " + str(max_time) + "\n" + if not 
start_new_other_segment: + other_type1 = other_type1 + str(prev_other_time) + " " + str(max_time) + "\n" + + other = other_type1 + other_type2 + return speech, music, other + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + utts = "" + for root, dirs, files in os.walk(in_dir): + for file in files: + if file.endswith(".txt"): + anno_in = open(os.path.join(root, file), 'r').readlines() + speech, music, other = process_file(anno_in) + utt = file.replace(".txt", "") + utts = utts + utt + "\n" + speech_fi_str = utt + "_speech.key" + music_fi_str = utt + "_music.key" + other_fi_str = utt + "_other.key" + speech_fi = open(os.path.join(out_dir, speech_fi_str), 'w') + speech_fi.write(speech) + music_fi = open(os.path.join(out_dir, music_fi_str), 'w') + music_fi.write(music) + other_fi = open(os.path.join(out_dir, other_fi_str), 'w') + other_fi.write(other) + utts_fi = open(os.path.join(out_dir, "utt_list"), 'w') + utts_fi.write(utts) + +if __name__=="__main__": + main() + diff --git a/egs/bn_music_speech/v1/local/make_bn.py b/egs/bn_music_speech/v1/local/make_bn.py new file mode 100755 index 00000000000..98836d32534 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_bn.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# Using the annotations created by refine_annotations_bn.py, this script +# creates the segments, utt2spk, and wav.scp files. +# +# This file is meant to be invoked by make_bn.sh. + +import os, sys +wav_dir = sys.argv[1] +out_dir = sys.argv[2] + +utts = open(os.path.join(out_dir, "utt_list"), 'r').readlines() +utts = set(x.rstrip() for x in utts) +wav = "" +segments = "" +utt2spk = "" +for subdir, dirs, files in os.walk(wav_dir): + for file in files: + utt = str(file).replace(".sph", "") + if file.endswith(".sph") and utt in utts: + wav = wav + utt + " sox " + subdir + "/" + utt + ".sph" + " -c 1 -r 16000 -t wav - |\n" +wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') +wav_fi.write(wav) + +for utt in utts: + music_filename = utt + "_music.key.refined" + speech_filename = utt + "_speech.key.refined" + music_fi = open(os.path.join(out_dir, music_filename), 'r').readlines() + speech_fi = open(os.path.join(out_dir, speech_filename), 'r').readlines() + count = 1 + for line in music_fi: + left, right = line.rstrip().split(" ") + segments = segments + utt + "-music-" + str(count) + " " + utt + " " + left + " " + right + "\n" + utt2spk = utt2spk + utt + "-music-" + str(count) + " " + utt + "-music-" + str(count) + "\n" + count += 1 + count = 1 + for line in speech_fi: + left, right = line.rstrip().split(" ") + segments = segments + utt + "-speech-" + str(count) + " " + utt + " " + left + " " + right + "\n" + utt2spk = utt2spk + utt + "-speech-" + str(count) + " " + utt + "-speech-" + str(count) + "\n" + count += 1 +utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') +utt2spk_fi.write(utt2spk) +segments_fi = open(os.path.join(out_dir, "segments"), 'w') +segments_fi.write(segments) + diff --git a/egs/bn_music_speech/v1/local/make_bn.sh b/egs/bn_music_speech/v1/local/make_bn.sh new file mode 100755 index 00000000000..5e2a29f0cca --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_bn.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the HUB4 Broadcast News +# data directory. 
The required datasets can be found at: +# https://catalog.ldc.upenn.edu/LDC97S44 +# https://catalog.ldc.upenn.edu/LDC97T22 + +set -e +sph_dir=$1 +transcript_dir=$2 +data_dir=$3 +tmp_dir=local/bn.tmp + +# These parameters are used when refining the annotations. +# A higher frames_per_second provides better resolution at the +# frame boundaries. Set min_seg to control the minimum length of the +# final segments. It seems that the original annotations for segments +# below half a second are not very accurate, so we test only on segments +# longer than this. +frames_per_sec=100 +min_seg=0.5 + +rm -rf local/bn.tmp +mkdir local/bn.tmp + +echo "$0: preparing annotations..." +local/make_annotations_bn.py ${transcript_dir} ${tmp_dir} +echo "$0: Removing overlapping annotations..." +local/refine_annotations_bn.py ${tmp_dir} ${frames_per_sec} ${min_seg} +echo "$0: Preparing broadcast news data directories ${data_dir}/bn..." +local/make_bn.py ${sph_dir} ${tmp_dir} + +mkdir -p ${data_dir}/bn +cp ${tmp_dir}/wav.scp ${data_dir}/bn/ +cp ${tmp_dir}/utt2spk ${data_dir}/bn/ +cp ${tmp_dir}/segments ${data_dir}/bn/ +rm -rf local/bn.tmp +utils/fix_data_dir.sh data/bn diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py new file mode 100755 index 00000000000..490de9baa37 --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_musan.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + music_dir = os.path.join(root_dir, "music") + print str(music_dir) + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + speech_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt 
+ utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh new file mode 100755 index 00000000000..1faac0ef58c --- /dev/null +++ b/egs/bn_music_speech/v1/local/make_musan.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +rm -rf local/musan.tmp +mkdir local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/bn_music_speech/v1/local/print_scores.py b/egs/bn_music_speech/v1/local/print_scores.py new file mode 100755 index 00000000000..c2b587cdcad --- /dev/null +++ b/egs/bn_music_speech/v1/local/print_scores.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script prints out lines of the form: +# . +# Its output is meant to be used as input to the binary +# compute-eer. The Broadcast News utterances have either +# "music" or "speech" in the utterance name, and so we +# can simply check if the utterance name contains one of +# those strings to determine if it is a target or nontarget +# utterance. We arbitrarily pick music to be the target class. + +import sys +utt2score = open(sys.argv[1], 'r').readlines() +for i in range(0, len(utt2score)): + utt, score = utt2score[i].rstrip().split() + if "music" in utt: + type = "target" + else: + type = "nontarget" + print score, type diff --git a/egs/bn_music_speech/v1/local/refine_annotations_bn.py b/egs/bn_music_speech/v1/local/refine_annotations_bn.py new file mode 100755 index 00000000000..52ac87c8640 --- /dev/null +++ b/egs/bn_music_speech/v1/local/refine_annotations_bn.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# Copyright 2015 David Snyder +# Apache 2.0. 
+# +# This script refines the annotation files produced by +# make_annotations_bn.py. In order to create unambiguous annotations, +# we remove any part of a segment that overlaps with another. Also, +# this script merges together contiguous segments that have the +# same annotation, and ensures that only segments longer than a +# designated length are created. +# +# This file is meant to be invoked from make_bn.sh. +import sys, os + +def seg_to_string(seg): + start = seg[0] + end = seg[1] + if start < end: + return str(start) + " " + str(end) + "\n" + else: + return "" + +def process_segs(raw_segs): + segs = [] + for seg in raw_segs: + lower, upper = map(float, seg.rstrip().split(" ")) + segs.append((lower, upper)) + return segs + +def resegment(music, speech, other, frame_length, min_seg): + frame2classes = [] + max_duration = 0 + all_segs = music + speech + other + for (start, end) in all_segs: + if end > max_duration: + max_duration = end + num_frames = int(max_duration) * frame_length + for i in range(0, num_frames): + frame2classes.append([]) + + annotate_frames(frame2classes, music, "music", frame_length, num_frames) + annotate_frames(frame2classes, speech, "speech", frame_length, num_frames) + annotate_frames(frame2classes, other, "other", frame_length, num_frames) + + curr_class = None + for i in range(0, len(frame2classes)): + if len(frame2classes[i]) != 1 or frame2classes[i][0] == "other": + curr_class = "other" + elif frame2classes[i][0] == "music": + curr_class = "music" + elif frame2classes[i][0] == "speech": + curr_class = "speech" + else: + curr_class = "other" + frame2classes[i] = curr_class + + new_music = [] + new_speech = [] + curr_class = frame2classes[0] + start_frame = 0 + for i in range(1, len(frame2classes)): + if curr_class != frame2classes[i]: + start = float(start_frame) / frame_length + end = float(i) / frame_length + if end - start > min_seg: + if curr_class == "music": + new_music.append((start, end)) + elif curr_class == "speech": + new_speech.append((start, end)) + start_frame = i + curr_class = frame2classes[i] + + return new_music, new_speech + + +def annotate_frames(frame2classes, segs, annotation, frame_length, max_duration): + for (start, end) in segs: + frame_start = min(int(start * frame_length), max_duration) + frame_end = min(int(end * frame_length), max_duration) + for i in range(frame_start, frame_end): + frame2classes[i].append(annotation) + +def main(): + out_dir = sys.argv[1] + frames_per_sec = int(sys.argv[2]) + min_seg_length = float(sys.argv[3]) + + utts = open(os.path.join(out_dir, "utt_list"), 'r').readlines() + for line in utts: + speech_filename = os.path.join(out_dir, line.rstrip() + "_speech.key") + music_filename = os.path.join(out_dir, line.rstrip() + "_music.key") + other_filename = os.path.join(out_dir, line.rstrip() + "_other.key") + raw_speech_segs = open(speech_filename, 'r').readlines() + raw_music_segs = open(music_filename, 'r').readlines() + raw_other_segs = open(other_filename, 'r').readlines() + speech_segs = process_segs(raw_speech_segs) + music_segs = process_segs(raw_music_segs) + other_segs = process_segs(raw_other_segs) + music_segs, speech_segs = resegment(music_segs, speech_segs, other_segs, frames_per_sec, min_seg_length) + + speech_output = "" + music_output = "" + for seg in music_segs: + music_output = music_output + seg_to_string(seg) + for seg in speech_segs: + speech_output = speech_output + seg_to_string(seg) + + speech_fi = open(speech_filename + ".refined", 'w') + music_fi = open(music_filename + 
".refined", 'w') + speech_fi.write(speech_output) + music_fi.write(music_output) + speech_fi.close() + music_fi.close() + +if __name__=="__main__": + main() diff --git a/egs/bn_music_speech/v1/path.sh b/egs/bn_music_speech/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/bn_music_speech/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh new file mode 100755 index 00000000000..67935ead983 --- /dev/null +++ b/egs/bn_music_speech/v1/run.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This example demonstrates music/speech discrimination. This recipe trains +# three GMMs on the music, speech and noise portions of the MUSAN corpus. +# We test the systems on Broadcast News. The Broadcast News test data consists +# of short segments of either speech or music. The classification decisions +# are made at a segment level from the average likelihoods of two GMMs. +# Results (EERs) are inline in comments below. +# +# See README.txt for more info on data required. + +. cmd.sh +. path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +local/make_bn.sh /export/corpora5/LDC/LDC97S44 \ + /export/corpora/LDC/LDC97T22 data + +local/make_musan.sh /export/corpora/JHU/musan data + +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/musan_speech exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/musan_music exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 5 --cmd "$train_cmd" \ + data/musan_noise exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ + data/bn exp/make_mfcc $mfccdir + +utils/fix_data_dir.sh data/musan_speech +utils/fix_data_dir.sh data/musan_music +utils/fix_data_dir.sh data/musan_noise +utils/fix_data_dir.sh data/bn + +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/musan_speech exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 5 --cmd "$train_cmd" \ + data/musan_noise exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/musan_music exp/make_vad $vaddir +sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/bn exp/make_vad $vaddir + +sid/train_diag_ubm.sh --nj 10 --cmd "$train_cmd" --delta-window 2 \ + data/musan_noise 32 exp/diag_ubm_noise & +sid/train_diag_ubm.sh --nj 20 --cmd "$train_cmd" --delta-window 2 \ + data/musan_speech 32 exp/diag_ubm_speech & +sid/train_diag_ubm.sh --nj 20 --cmd "$train_cmd" --delta-window 2 \ + data/musan_music 32 exp/diag_ubm_music +wait; + +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_noise \ + exp/diag_ubm_noise exp/full_ubm_noise & +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_speech \ + exp/diag_ubm_speech exp/full_ubm_speech & +sid/train_full_ubm.sh --nj 20 --cmd "$train_cmd" \ + --remove-low-count-gaussians false data/musan_music \ + exp/diag_ubm_music exp/full_ubm_music +wait; + +sid/music_id.sh --cmd "$train_cmd" --nj 40 \ + exp/full_ubm_music 
exp/full_ubm_speech \ + data/bn exp/bn_music_speech +sid/music_id.sh --cmd "$train_cmd" --nj 40 \ + exp/full_ubm_noise exp/full_ubm_speech \ + data/bn exp/bn_noise_speech + +printf "EER using GMMs trained on music and speech" +compute-eer <(local/print_scores.py exp/bn_music_speech/ratio) +# Equal error rate is 0.344234%, at threshold 0.525752 +printf "\nEER using GMM trained on noise instead of music" +compute-eer <(local/print_scores.py exp/bn_noise_speech/ratio) +# Equal error rate is 0.860585%, at threshold 0.123218 + +# The following script replaces the VAD decisions originally computed by +# the energy-based VAD. It uses the GMMs trained earlier in the script +# to make frame-level decisions. Due to the mapping provided in +# conf/merge_vad_map.txt, "0" corresponds to silence, "1" to speech, and +# "2" to music. +sid/compute_vad_decision_gmm.sh --nj 40 --cmd "$train_cmd" \ + --merge-map-config conf/merge_vad_map.txt --use-energy-vad true \ + data/bn exp/full_ubm_noise exp/full_ubm_speech/ \ + exp/full_ubm_music/ exp/vad_gmm exp/vad_gmm/ diff --git a/egs/bn_music_speech/v1/sid b/egs/bn_music_speech/v1/sid new file mode 120000 index 00000000000..a9cdb0f0013 --- /dev/null +++ b/egs/bn_music_speech/v1/sid @@ -0,0 +1 @@ +../../sre10/v1/sid \ No newline at end of file diff --git a/egs/bn_music_speech/v1/steps b/egs/bn_music_speech/v1/steps new file mode 120000 index 00000000000..83b3d2b59a3 --- /dev/null +++ b/egs/bn_music_speech/v1/steps @@ -0,0 +1 @@ +../../sre10/v1/steps \ No newline at end of file diff --git a/egs/bn_music_speech/v1/utils b/egs/bn_music_speech/v1/utils new file mode 120000 index 00000000000..726839e0092 --- /dev/null +++ b/egs/bn_music_speech/v1/utils @@ -0,0 +1 @@ +../../sre10/v1/utils \ No newline at end of file diff --git a/egs/callhome_egyptian/s5/RESULTS b/egs/callhome_egyptian/s5/RESULTS new file mode 100644 index 00000000000..1d1c8fd1690 --- /dev/null +++ b/egs/callhome_egyptian/s5/RESULTS @@ -0,0 +1,226 @@ +-------------------------------------------------------------------------------------- +Triphone with mono alignment (small) +-------------------------------------------------------------------------------------- +exp/tri1/decode_dev/wer_11 %WER 67.90 [ 22753 / 33509, 1778 ins, 5369 del, 15606 sub ] +exp/tri1/decode_dev/wer_12 %WER 67.91 [ 22757 / 33509, 1555 ins, 5782 del, 15420 sub ] +exp/tri1/decode_dev/wer_10 %WER 68.14 [ 22834 / 33509, 2041 ins, 4902 del, 15891 sub ] +exp/tri1/decode_dev/wer_13 %WER 68.19 [ 22851 / 33509, 1428 ins, 6227 del, 15196 sub ] +exp/tri1/decode_dev/wer_9 %WER 68.68 [ 23015 / 33509, 2379 ins, 4422 del, 16214 sub ] +exp/tri1/decode_dev/wer_8 %WER 69.53 [ 23298 / 33509, 2748 ins, 4024 del, 16526 sub ] +exp/tri1/decode_dev/wer_7 %WER 70.92 [ 23766 / 33509, 3180 ins, 3609 del, 16977 sub ] +exp/tri1/decode_dev/wer_6 %WER 72.71 [ 24366 / 33509, 3674 ins, 3218 del, 17474 sub ] +exp/tri1/decode_dev/wer_5 %WER 75.02 [ 25137 / 33509, 4247 ins, 2886 del, 18004 sub ] +exp/tri1/decode_dev/wer_4 %WER 77.08 [ 25830 / 33509, 4794 ins, 2625 del, 18411 sub ] +exp/tri1/decode_dev/wer_3 %WER 79.37 [ 26595 / 33509, 5340 ins, 2424 del, 18831 sub ] +exp/tri1/decode_dev/wer_2 %WER 81.52 [ 27317 / 33509, 5869 ins, 2268 del, 19180 sub ] + +-------------------------------------------------------------------------------------- +Triphone with tri alignments +-------------------------------------------------------------------------------------- +exp/tri2/decode_dev/wer_11 %WER 66.41 [ 22253 / 33509, 1841 ins, 5001 del, 15411 sub ] +exp/tri2/decode_dev/wer_12 
%WER 66.44 [ 22262 / 33509, 1620 ins, 5463 del, 15179 sub ] +exp/tri2/decode_dev/wer_13 %WER 66.61 [ 22322 / 33509, 1448 ins, 5926 del, 14948 sub ] +exp/tri2/decode_dev/wer_10 %WER 66.73 [ 22360 / 33509, 2153 ins, 4575 del, 15632 sub ] +exp/tri2/decode_dev/wer_9 %WER 67.36 [ 22573 / 33509, 2453 ins, 4102 del, 16018 sub ] +exp/tri2/decode_dev/wer_8 %WER 68.65 [ 23003 / 33509, 2874 ins, 3741 del, 16388 sub ] +exp/tri2/decode_dev/wer_7 %WER 70.19 [ 23521 / 33509, 3380 ins, 3363 del, 16778 sub ] +exp/tri2/decode_dev/wer_6 %WER 72.17 [ 24183 / 33509, 3950 ins, 3003 del, 17230 sub ] +exp/tri2/decode_dev/wer_5 %WER 74.31 [ 24901 / 33509, 4476 ins, 2715 del, 17710 sub ] +exp/tri2/decode_dev/wer_4 %WER 76.48 [ 25627 / 33509, 5044 ins, 2460 del, 18123 sub ] +exp/tri2/decode_dev/wer_3 %WER 78.52 [ 26312 / 33509, 5544 ins, 2251 del, 18517 sub ] +exp/tri2/decode_dev/wer_2 %WER 80.92 [ 27115 / 33509, 6114 ins, 2105 del, 18896 sub ] + +-------------------------------------------------------------------------------------- +Triphone + LDA + MLLT +-------------------------------------------------------------------------------------- +exp/tri3a/decode_dev/wer_11 %WER 62.31 [ 20878 / 33509, 1793 ins, 4872 del, 14213 sub ] +exp/tri3a/decode_dev/wer_12 %WER 62.33 [ 20887 / 33509, 1581 ins, 5349 del, 13957 sub ] +exp/tri3a/decode_dev/wer_10 %WER 62.51 [ 20947 / 33509, 2058 ins, 4415 del, 14474 sub ] +exp/tri3a/decode_dev/wer_13 %WER 62.68 [ 21005 / 33509, 1388 ins, 5856 del, 13761 sub ] +exp/tri3a/decode_dev/wer_9 %WER 63.20 [ 21177 / 33509, 2369 ins, 3972 del, 14836 sub ] +exp/tri3a/decode_dev/wer_8 %WER 64.29 [ 21543 / 33509, 2771 ins, 3604 del, 15168 sub ] +exp/tri3a/decode_dev/wer_7 %WER 65.63 [ 21993 / 33509, 3209 ins, 3288 del, 15496 sub ] +exp/tri3a/decode_dev/wer_6 %WER 67.63 [ 22661 / 33509, 3723 ins, 2970 del, 15968 sub ] +exp/tri3a/decode_dev/wer_5 %WER 69.68 [ 23350 / 33509, 4241 ins, 2686 del, 16423 sub ] +exp/tri3a/decode_dev/wer_4 %WER 71.83 [ 24069 / 33509, 4774 ins, 2439 del, 16856 sub ] +exp/tri3a/decode_dev/wer_3 %WER 74.14 [ 24842 / 33509, 5326 ins, 2278 del, 17238 sub ] +exp/tri3a/decode_dev/wer_2 %WER 76.28 [ 25561 / 33509, 5814 ins, 2152 del, 17595 sub ] + +-------------------------------------------------------------------------------------- ++ SAT + fMLLR +-------------------------------------------------------------------------------------- +exp/tri4a/decode_dev/wer_12 %WER 58.22 [ 19510 / 33509, 1796 ins, 4447 del, 13267 sub ] +exp/tri4a/decode_dev/wer_11 %WER 58.29 [ 19532 / 33509, 1998 ins, 4124 del, 13410 sub ] +exp/tri4a/decode_dev/wer_13 %WER 58.47 [ 19593 / 33509, 1634 ins, 4808 del, 13151 sub ] +exp/tri4a/decode_dev/wer_10 %WER 58.61 [ 19641 / 33509, 2283 ins, 3790 del, 13568 sub ] +exp/tri4a/decode_dev/wer_9 %WER 59.29 [ 19867 / 33509, 2591 ins, 3455 del, 13821 sub ] +exp/tri4a/decode_dev/wer_8 %WER 60.60 [ 20307 / 33509, 2969 ins, 3133 del, 14205 sub ] +exp/tri4a/decode_dev/wer_7 %WER 62.11 [ 20812 / 33509, 3471 ins, 2790 del, 14551 sub ] +exp/tri4a/decode_dev/wer_6 %WER 64.08 [ 21471 / 33509, 3976 ins, 2508 del, 14987 sub ] +exp/tri4a/decode_dev/wer_5 %WER 66.25 [ 22200 / 33509, 4563 ins, 2283 del, 15354 sub ] +exp/tri4a/decode_dev/wer_4 %WER 68.40 [ 22920 / 33509, 5091 ins, 2106 del, 15723 sub ] +exp/tri4a/decode_dev/wer_3 %WER 70.36 [ 23576 / 33509, 5576 ins, 1933 del, 16067 sub ] +exp/tri4a/decode_dev/wer_2 %WER 72.33 [ 24236 / 33509, 6047 ins, 1819 del, 16370 sub ] + +-------------------------------------------------------------------------------------- ++ More leaves and 
gaussians +-------------------------------------------------------------------------------------- +exp/tri5a/decode_dev/wer_12 %WER 58.06 [ 19456 / 33509, 1866 ins, 4379 del, 13211 sub ] +exp/tri5a/decode_dev/wer_11 %WER 58.19 [ 19498 / 33509, 2105 ins, 4031 del, 13362 sub ] +exp/tri5a/decode_dev/wer_13 %WER 58.37 [ 19558 / 33509, 1670 ins, 4734 del, 13154 sub ] +exp/tri5a/decode_dev/wer_10 %WER 58.64 [ 19651 / 33509, 2364 ins, 3696 del, 13591 sub ] +exp/tri5a/decode_dev/wer_9 %WER 59.46 [ 19923 / 33509, 2711 ins, 3386 del, 13826 sub ] +exp/tri5a/decode_dev/wer_8 %WER 60.49 [ 20270 / 33509, 3093 ins, 3040 del, 14137 sub ] +exp/tri5a/decode_dev/wer_7 %WER 62.28 [ 20871 / 33509, 3592 ins, 2751 del, 14528 sub ] +exp/tri5a/decode_dev/wer_6 %WER 64.11 [ 21483 / 33509, 4107 ins, 2465 del, 14911 sub ] +exp/tri5a/decode_dev/wer_5 %WER 66.27 [ 22208 / 33509, 4674 ins, 2274 del, 15260 sub ] +exp/tri5a/decode_dev/wer_4 %WER 68.31 [ 22891 / 33509, 5171 ins, 2076 del, 15644 sub ] +exp/tri5a/decode_dev/wer_3 %WER 70.35 [ 23574 / 33509, 5646 ins, 1893 del, 16035 sub ] +exp/tri5a/decode_dev/wer_2 %WER 72.46 [ 24279 / 33509, 6152 ins, 1784 del, 16343 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/tri5a/decode_test/wer_10 %WER 57.09 [ 8927 / 15637, 879 ins, 1783 del, 6265 sub ] +exp/tri5a/decode_test/wer_11 %WER 56.60 [ 8851 / 15637, 782 ins, 1946 del, 6123 sub ] +exp/tri5a/decode_test/wer_12 %WER 56.46 [ 8828 / 15637, 688 ins, 2085 del, 6055 sub ] +exp/tri5a/decode_test/wer_13 %WER 56.73 [ 8871 / 15637, 629 ins, 2241 del, 6001 sub ] +exp/tri5a/decode_test/wer_2 %WER 68.81 [ 10760 / 15637, 2364 ins, 932 del, 7464 sub ] +exp/tri5a/decode_test/wer_3 %WER 66.74 [ 10436 / 15637, 2152 ins, 995 del, 7289 sub ] +exp/tri5a/decode_test/wer_4 %WER 64.55 [ 10093 / 15637, 1919 ins, 1073 del, 7101 sub ] +exp/tri5a/decode_test/wer_5 %WER 62.86 [ 9830 / 15637, 1727 ins, 1163 del, 6940 sub ] +exp/tri5a/decode_test/wer_6 %WER 61.03 [ 9543 / 15637, 1497 ins, 1286 del, 6760 sub ] +exp/tri5a/decode_test/wer_7 %WER 59.44 [ 9295 / 15637, 1311 ins, 1391 del, 6593 sub ] +exp/tri5a/decode_test/wer_8 %WER 58.41 [ 9134 / 15637, 1141 ins, 1515 del, 6478 sub ] +exp/tri5a/decode_test/wer_9 %WER 57.72 [ 9025 / 15637, 1008 ins, 1651 del, 6366 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/tri5a/decode_sup/wer_10 %WER 63.77 [ 11101 / 17409, 1164 ins, 2476 del, 7461 sub ] +exp/tri5a/decode_sup/wer_11 %WER 63.52 [ 11059 / 17409, 1042 ins, 2666 del, 7351 sub ] +exp/tri5a/decode_sup/wer_12 %WER 63.29 [ 11019 / 17409, 930 ins, 2884 del, 7205 sub ] +exp/tri5a/decode_sup/wer_13 %WER 63.12 [ 10989 / 17409, 814 ins, 3124 del, 7051 sub ] +exp/tri5a/decode_sup/wer_2 %WER 75.75 [ 13187 / 17409, 2952 ins, 1279 del, 8956 sub ] +exp/tri5a/decode_sup/wer_3 %WER 74.18 [ 12914 / 17409, 2728 ins, 1371 del, 8815 sub ] +exp/tri5a/decode_sup/wer_4 %WER 72.28 [ 12584 / 17409, 2491 ins, 1444 del, 8649 sub ] +exp/tri5a/decode_sup/wer_5 %WER 70.04 [ 12194 / 17409, 2206 ins, 1562 del, 8426 sub ] +exp/tri5a/decode_sup/wer_6 %WER 68.20 [ 11873 / 17409, 1944 ins, 1719 del, 8210 sub ] +exp/tri5a/decode_sup/wer_7 %WER 66.61 [ 11596 / 17409, 1720 ins, 1880 del, 7996 sub ] +exp/tri5a/decode_sup/wer_8 %WER 65.37 [ 11381 / 17409, 1500 ins, 2075 del, 7806 sub ] +exp/tri5a/decode_sup/wer_9 %WER 64.45 [ 11220 / 17409, 1345 ins, 2275 del, 7600 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 
*** +exp/tri5a/decode_h5/wer_10 %WER 61.38 [ 10303 / 16785, 839 ins, 2581 del, 6883 sub ] +exp/tri5a/decode_h5/wer_11 %WER 61.27 [ 10285 / 16785, 730 ins, 2760 del, 6795 sub ] +exp/tri5a/decode_h5/wer_12 %WER 61.41 [ 10307 / 16785, 646 ins, 2953 del, 6708 sub ] +exp/tri5a/decode_h5/wer_13 %WER 61.61 [ 10342 / 16785, 568 ins, 3132 del, 6642 sub ] +exp/tri5a/decode_h5/wer_2 %WER 71.50 [ 12001 / 16785, 2156 ins, 1385 del, 8460 sub ] +exp/tri5a/decode_h5/wer_3 %WER 69.96 [ 11742 / 16785, 1975 ins, 1476 del, 8291 sub ] +exp/tri5a/decode_h5/wer_4 %WER 68.23 [ 11453 / 16785, 1765 ins, 1569 del, 8119 sub ] +exp/tri5a/decode_h5/wer_5 %WER 66.48 [ 11159 / 16785, 1595 ins, 1703 del, 7861 sub ] +exp/tri5a/decode_h5/wer_6 %WER 64.88 [ 10890 / 16785, 1411 ins, 1839 del, 7640 sub ] +exp/tri5a/decode_h5/wer_7 %WER 63.67 [ 10687 / 16785, 1229 ins, 2019 del, 7439 sub ] +exp/tri5a/decode_h5/wer_8 %WER 62.63 [ 10513 / 16785, 1082 ins, 2193 del, 7238 sub ] +exp/tri5a/decode_h5/wer_9 %WER 61.95 [ 10399 / 16785, 959 ins, 2398 del, 7042 sub ] + + +-------------------------------------------------------------------------------------- +pNorm-Ensemble DNN +-------------------------------------------------------------------------------------- +exp/tri6a_dnn/decode_dev/wer_10 %WER 50.55 [ 16939 / 33509, 1407 ins, 4188 del, 11344 sub ] +exp/tri6a_dnn/decode_dev/wer_11 %WER 51.03 [ 17098 / 33509, 1239 ins, 4563 del, 11296 sub ] +exp/tri6a_dnn/decode_dev/wer_12 %WER 51.69 [ 17321 / 33509, 1126 ins, 5010 del, 11185 sub ] +exp/tri6a_dnn/decode_dev/wer_13 %WER 52.54 [ 17607 / 33509, 1010 ins, 5466 del, 11131 sub ] +exp/tri6a_dnn/decode_dev/wer_14 %WER 53.52 [ 17933 / 33509, 908 ins, 5918 del, 11107 sub ] +exp/tri6a_dnn/decode_dev/wer_15 %WER 54.36 [ 18214 / 33509, 817 ins, 6294 del, 11103 sub ] +exp/tri6a_dnn/decode_dev/wer_16 %WER 55.08 [ 18456 / 33509, 739 ins, 6622 del, 11095 sub ] +exp/tri6a_dnn/decode_dev/wer_8 %WER 50.34 [ 16869 / 33509, 1841 ins, 3456 del, 11572 sub ] +exp/tri6a_dnn/decode_dev/wer_9 %WER 50.31 [ 16859 / 33509, 1617 ins, 3794 del, 11448 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/tri6a_dnn/decode_test/wer_10 %WER 49.76 [ 7781 / 15637, 542 ins, 2022 del, 5217 sub ] +exp/tri6a_dnn/decode_test/wer_11 %WER 50.40 [ 7881 / 15637, 489 ins, 2195 del, 5197 sub ] +exp/tri6a_dnn/decode_test/wer_12 %WER 50.82 [ 7947 / 15637, 431 ins, 2356 del, 5160 sub ] +exp/tri6a_dnn/decode_test/wer_13 %WER 51.72 [ 8087 / 15637, 375 ins, 2591 del, 5121 sub ] +exp/tri6a_dnn/decode_test/wer_14 %WER 52.65 [ 8233 / 15637, 324 ins, 2800 del, 5109 sub ] +exp/tri6a_dnn/decode_test/wer_15 %WER 53.57 [ 8376 / 15637, 284 ins, 2986 del, 5106 sub ] +exp/tri6a_dnn/decode_test/wer_16 %WER 54.37 [ 8502 / 15637, 246 ins, 3131 del, 5125 sub ] +exp/tri6a_dnn/decode_test/wer_8 %WER 49.33 [ 7714 / 15637, 696 ins, 1721 del, 5297 sub ] +exp/tri6a_dnn/decode_test/wer_9 %WER 49.54 [ 7747 / 15637, 632 ins, 1873 del, 5242 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/tri6a_dnn/decode_sup/wer_10 %WER 58.14 [ 10121 / 17409, 895 ins, 2684 del, 6542 sub ] +exp/tri6a_dnn/decode_sup/wer_11 %WER 58.41 [ 10169 / 17409, 791 ins, 2927 del, 6451 sub ] +exp/tri6a_dnn/decode_sup/wer_12 %WER 58.71 [ 10220 / 17409, 681 ins, 3214 del, 6325 sub ] +exp/tri6a_dnn/decode_sup/wer_13 %WER 59.14 [ 10295 / 17409, 593 ins, 3502 del, 6200 sub ] +exp/tri6a_dnn/decode_sup/wer_14 %WER 59.84 [ 10417 / 17409, 515 ins, 3741 del, 6161 sub ] 
+exp/tri6a_dnn/decode_sup/wer_15 %WER 60.33 [ 10503 / 17409, 450 ins, 3974 del, 6079 sub ] +exp/tri6a_dnn/decode_sup/wer_16 %WER 60.78 [ 10581 / 17409, 393 ins, 4157 del, 6031 sub ] +exp/tri6a_dnn/decode_sup/wer_8 %WER 58.57 [ 10197 / 17409, 1194 ins, 2262 del, 6741 sub ] +exp/tri6a_dnn/decode_sup/wer_9 %WER 58.15 [ 10123 / 17409, 1023 ins, 2477 del, 6623 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 *** +exp/tri6a_dnn/decode_h5/wer_10 %WER 55.98 [ 9396 / 16785, 592 ins, 2734 del, 6070 sub ] +exp/tri6a_dnn/decode_h5/wer_11 %WER 56.11 [ 9418 / 16785, 495 ins, 2974 del, 5949 sub ] +exp/tri6a_dnn/decode_h5/wer_12 %WER 56.75 [ 9526 / 16785, 418 ins, 3247 del, 5861 sub ] +exp/tri6a_dnn/decode_h5/wer_13 %WER 57.61 [ 9670 / 16785, 368 ins, 3482 del, 5820 sub ] +exp/tri6a_dnn/decode_h5/wer_14 %WER 58.37 [ 9797 / 16785, 318 ins, 3739 del, 5740 sub ] +exp/tri6a_dnn/decode_h5/wer_15 %WER 59.32 [ 9957 / 16785, 284 ins, 3960 del, 5713 sub ] +exp/tri6a_dnn/decode_h5/wer_16 %WER 59.93 [ 10060 / 16785, 256 ins, 4127 del, 5677 sub ] +exp/tri6a_dnn/decode_h5/wer_8 %WER 55.60 [ 9333 / 16785, 750 ins, 2323 del, 6260 sub ] +exp/tri6a_dnn/decode_h5/wer_9 %WER 55.76 [ 9360 / 16785, 666 ins, 2531 del, 6163 sub ] + +-------------------------------------------------------------------------------------- +TDNN + iVector +-------------------------------------------------------------------------------------- +exp/nnet3/nnet_tdnn_a/decode_dev/wer_10 %WER 53.55 [ 17943 / 33509, 1332 ins, 4855 del, 11756 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_11 %WER 53.82 [ 18033 / 33509, 1176 ins, 5278 del, 11579 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_12 %WER 54.17 [ 18153 / 33509, 1040 ins, 5696 del, 11417 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_13 %WER 54.75 [ 18345 / 33509, 912 ins, 6111 del, 11322 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_2 %WER 65.73 [ 22026 / 33509, 4773 ins, 2143 del, 15110 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_3 %WER 62.48 [ 20937 / 33509, 4112 ins, 2383 del, 14442 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_4 %WER 59.65 [ 19989 / 33509, 3488 ins, 2699 del, 13802 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_5 %WER 57.41 [ 19238 / 33509, 2942 ins, 3032 del, 13264 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_6 %WER 55.58 [ 18624 / 33509, 2461 ins, 3356 del, 12807 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_7 %WER 54.44 [ 18242 / 33509, 2092 ins, 3714 del, 12436 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_8 %WER 53.63 [ 17970 / 33509, 1766 ins, 4087 del, 12117 sub ] +exp/nnet3/nnet_tdnn_a/decode_dev/wer_9 %WER 53.51 [ 17931 / 33509, 1533 ins, 4452 del, 11946 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Test *** +exp/nnet3/nnet_tdnn_a/decode_test/wer_10 %WER 52.29 [ 8177 / 15637, 536 ins, 2190 del, 5451 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_11 %WER 52.49 [ 8208 / 15637, 474 ins, 2373 del, 5361 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_12 %WER 53.03 [ 8293 / 15637, 420 ins, 2558 del, 5315 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_13 %WER 53.52 [ 8369 / 15637, 361 ins, 2721 del, 5287 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_2 %WER 64.18 [ 10036 / 15637, 2048 ins, 980 del, 7008 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_3 %WER 60.84 [ 9513 / 15637, 1726 ins, 1076 del, 6711 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_4 %WER 58.23 [ 9106 / 15637, 1471 ins, 1210 del, 6425 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_5 %WER 55.77 [ 8720 / 15637, 1206 ins, 1351 del, 6163 sub ] 
+exp/nnet3/nnet_tdnn_a/decode_test/wer_6 %WER 54.19 [ 8474 / 15637, 1005 ins, 1505 del, 5964 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_7 %WER 53.08 [ 8300 / 15637, 827 ins, 1689 del, 5784 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_8 %WER 52.43 [ 8198 / 15637, 712 ins, 1841 del, 5645 sub ] +exp/nnet3/nnet_tdnn_a/decode_test/wer_9 %WER 52.10 [ 8147 / 15637, 619 ins, 1993 del, 5535 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** Supplement *** +exp/nnet3/nnet_tdnn_a/decode_sup/wer_10 %WER 80.31 [ 13981 / 17409, 407 ins, 8528 del, 5046 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_11 %WER 80.42 [ 14001 / 17409, 360 ins, 8752 del, 4889 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_12 %WER 80.52 [ 14017 / 17409, 318 ins, 8968 del, 4731 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_13 %WER 80.79 [ 14065 / 17409, 291 ins, 9155 del, 4619 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_2 %WER 85.93 [ 14960 / 17409, 1330 ins, 6454 del, 7176 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_3 %WER 84.65 [ 14737 / 17409, 1151 ins, 6635 del, 6951 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_4 %WER 83.31 [ 14504 / 17409, 968 ins, 6890 del, 6646 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_5 %WER 82.52 [ 14366 / 17409, 839 ins, 7159 del, 6368 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_6 %WER 81.66 [ 14216 / 17409, 711 ins, 7477 del, 6028 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_7 %WER 81.08 [ 14116 / 17409, 631 ins, 7750 del, 5735 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_8 %WER 80.52 [ 14017 / 17409, 547 ins, 7999 del, 5471 sub ] +exp/nnet3/nnet_tdnn_a/decode_sup/wer_9 %WER 80.31 [ 13982 / 17409, 468 ins, 8269 del, 5245 sub ] +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +*** H5 *** +exp/nnet3/nnet_tdnn_a/decode_h5/wer_10 %WER 85.15 [ 14293 / 16785, 170 ins, 9449 del, 4674 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_11 %WER 85.24 [ 14307 / 16785, 142 ins, 9700 del, 4465 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_12 %WER 85.51 [ 14353 / 16785, 119 ins, 9920 del, 4314 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_13 %WER 85.81 [ 14403 / 16785, 106 ins, 10113 del, 4184 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_2 %WER 88.11 [ 14790 / 16785, 749 ins, 7107 del, 6934 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_3 %WER 87.16 [ 14629 / 16785, 630 ins, 7324 del, 6675 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_4 %WER 86.45 [ 14510 / 16785, 509 ins, 7607 del, 6394 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_5 %WER 85.71 [ 14387 / 16785, 423 ins, 7925 del, 6039 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_6 %WER 85.24 [ 14307 / 16785, 341 ins, 8248 del, 5718 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_7 %WER 84.99 [ 14266 / 16785, 277 ins, 8617 del, 5372 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_8 %WER 84.85 [ 14242 / 16785, 230 ins, 8916 del, 5096 sub ] +exp/nnet3/nnet_tdnn_a/decode_h5/wer_9 %WER 84.92 [ 14253 / 16785, 192 ins, 9200 del, 4861 sub ] diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/callhome_egyptian/s5/conf/mfcc_hires.conf b/egs/callhome_egyptian/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ad519cf4f9c --- /dev/null +++ b/egs/callhome_egyptian/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) +--sample-frequency=8000 diff --git a/egs/callhome_egyptian/s5/conf/online_cmvn.conf b/egs/callhome_egyptian/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/callhome_egyptian/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh b/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh index aaa45f8e4e1..78059f153a8 100755 --- a/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh +++ b/egs/callhome_egyptian/s5/local/callhome_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -12,25 +12,18 @@ mkdir -p data/lang_test cp -r data/lang/* data/lang_test # grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause +# LM doesn't have these "invalid combinations". 
These can cause # determinization failures of CLG [ends up being epsilon cycles]. # Note: remove_oovs.pl takes a list of words in the LM that aren't in # our word list. Since our LM doesn't have any, we just give it # /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +52,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/callhome_egyptian/s5/local/callhome_data_prep.sh b/egs/callhome_egyptian/s5/local/callhome_data_prep.sh index c8e5b30038e..f8a4430aeda 100755 --- a/egs/callhome_egyptian/s5/local/callhome_data_prep.sh +++ b/egs/callhome_egyptian/s5/local/callhome_data_prep.sh @@ -1,8 +1,8 @@ #!/bin/bash # # Johns Hopkins University : (Gaurav Kumar) -# The input is the Callhome Egyptian Arabic Dataset which contains *.sph files -# In addition the transcripts are needed as well. +# The input is the Callhome Egyptian Arabic Dataset which contains *.sph files +# In addition the transcripts are needed as well. #TODO: Rewrite intro, copyright stuff and dir information # To be run from one directory above this script. @@ -12,7 +12,7 @@ stage=0 export LC_ALL=C -if [ $# -lt 2 ]; then +if [ $# -lt 6 ]; then echo "Arguments should be the location of the Callhome Egyptian Arabic Speech and Transcript Directories, se e ../run.sh for example." exit 1; @@ -45,8 +45,18 @@ ln -s $* links # Basic spot checks to see if we got the data that we needed if [ ! -d links/LDC97S45 -o ! -d links/LDC97T19 ]; then - echo "The speech and the data directories need to be named LDC97S45 and LDC97T19 respecti -vely" + echo "The speech and the data directories need to be named LDC97S45 and LDC97T19 respectively" + exit 1; +fi +if [ ! -d links/LDC2002S37 -o ! -d links/LDC2002T38 ]; +then + echo "The Callhome supplement directories need to be named LDC2002S37 and LDC2002T38." + o + exit 1; +fi +if [ ! -d links/LDC2002S22 -o ! -d links/LDC2002T39 ]; +then + echo "The H5-ECA directories need to be named LDC2002S22 and LDC2002T39." exit 1; fi @@ -63,27 +73,71 @@ then exit 1; fi +if [ ! -d links/LDC2002S37/SPEECH ]; +then + echo "Callhome supplement directories missing or not properly organised within the speech data dir" + exit 1; +fi + +if [ ! -d links/LDC2002T38/ch_ara_transcr_suppl/transcr ] +then + echo "Callhome supplement Transcript directories missing or not properly organised" + exit 1; +fi + +if [ ! -d links/LDC2002S22/SPEECH ]; +then + echo "H5 directories missing or not properly organised within the speech data dir" + exit 1; +fi + +if [ ! 
-d links/LDC2002T39/transcr ] +then + echo "H5 Transcript directories missing or not properly organised" + exit 1; +fi + speech_train=$dir/links/LDC97S45/CALLHOME/ARABIC/TRAIN speech_dev=$dir/links/LDC97S45/CALLHOME/ARABIC/DEVTEST speech_test=$dir/links/LDC97S45/CALLHOME/ARABIC/EVLTEST -transcripts_train=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/train/roman +transcripts_train=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/train/roman transcripts_dev=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/devtest/roman transcripts_test=$dir/links/LDC97T19/callhome_arabic_trans_970711/transcrp/evaltest/roman - -fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` -fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` -fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` -fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` -fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` -fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` +speech_sup=$dir/links/LDC2002S37/SPEECH +transcripts_sup=$dir/links/LDC2002T38/ch_ara_transcr_suppl/transcr +speech_h5=$dir/links/LDC2002S22/SPEECH +transcripts_h5=$dir/links/LDC2002T39/transcr + +fcount_train=`find ${speech_train} -iname '*.SPH' | wc -l` +fcount_dev=`find ${speech_dev} -iname '*.SPH' | wc -l` +fcount_test=`find ${speech_test} -iname '*.SPH' | wc -l` +fcount_t_train=`find ${transcripts_train} -iname '*.txt' | wc -l` +fcount_t_dev=`find ${transcripts_dev} -iname '*.txt' | wc -l` +fcount_t_test=`find ${transcripts_test} -iname '*.txt' | wc -l` +fcount_sup=`find ${speech_sup} -iname '*.SPH' | wc -l` +fcount_t_sup=`find ${transcripts_sup} -iname '*.txt' | wc -l` +fcount_h5=`find ${speech_h5} -iname '*.SPH' | wc -l` +fcount_t_h5=`find ${transcripts_h5} -iname '*.txt' | wc -l` #Now check if we got all the files that we needed -if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; -then - echo "Incorrect number of files in the data directories" +if [ $fcount_train != 80 -o $fcount_dev != 20 -o $fcount_test != 20 -o $fcount_t_train != 80 -o $fcount_t_dev != 20 -o $fcount_t_test != 20 ]; +then + echo "Incorrect number of files in the data directories" echo "The paritions should contain 80/20/20 files" - exit 1; -fi + exit 1; +fi +if [ $fcount_sup != 20 -o $fcount_t_sup != 20 ]; +then + echo "Incorrect number of files in the ECA sup data directories" + echo "The paritions should contain 20/20 files" + exit 1; +fi +if [ $fcount_h5 != 20 -o $fcount_t_h5 != 20 ]; +then + echo "Incorrect number of files in the H5 data directories" + echo "The paritions should contain 20/20 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -91,15 +145,19 @@ if [ $stage -le 0 ]; then find $speech_train -iname '*.sph'; find $speech_dev -iname '*.sph'; find $speech_test -iname '*.sph'; + find $speech_sup -iname '*.sph'; + find $speech_h5 -iname '*.sph'; ) > $tmpdir/callhome_train_sph.flist #Get all the transcripts in one place - ( + ( find $transcripts_train -iname '*.txt'; find $transcripts_dev -iname '*.txt'; find $transcripts_test -iname '*.txt'; - ) > $tmpdir/callhome_train_transcripts.flist + find $transcripts_sup -iname '*.txt'; + find $transcripts_h5 -iname '*.txt'; + ) > $tmpdir/callhome_train_transcripts.flist fi @@ -109,7 +167,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; 
then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -145,7 +203,7 @@ if [ $stage -le 2 ]; then ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' > $dir/train_all/utt2spk \ && echo "Error producing utt2spk file" && exit 1; - # Remove utterances that have the same start and end time. Corresponding text entries will be removed when use + # Remove utterances that have the same start and end time. Corresponding text entries will be removed when use # fix_data_dir.sh and validate_data_dir.sh later cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2; $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e\n"; ' | \ diff --git a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py new file mode 100644 index 00000000000..f5b69a1ff86 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py @@ -0,0 +1,38 @@ +#!/usr/bin/env py + +# Converts a romanized ECA word list (symbol table) to +# a version in the arabic script + +import sys +import codecs + +if len(sys.argv) < 3: + print "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]" + print "E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ + /export/corpora/LDC/LDC99L22" + sys.exit(1) + +# Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8 +symtable = codecs.open(sys.argv[1], encoding="utf8") +lexicon = codecs.open(sys.argv[2] + "/callhome_arabic_lexicon_991012/ar_lex.v07", encoding="iso-8859-6") + +dict_cache = {} +# First read off the dictionary and store stuff in a cache +for line in lexicon: + line = line.strip().split() + roman = line[0].strip() + script = line[1].strip() + assert roman not in dict_cache + dict_cache[roman] = script + +# Now read the symbol table and write off the ut8 versions +for line in symtable: + line = line.strip().split() + if line[0] in dict_cache: + output = dict_cache[line[0]] + " " + line[1] + else: + output = line[0] + " " + line[1] + sys.stdout.write(output.encode("utf-8") + "\n") + +lexicon.close() +symtable.close() diff --git a/egs/callhome_egyptian/s5/local/create_splits b/egs/callhome_egyptian/s5/local/create_splits index 98b27b0109e..80a32cea394 100755 --- a/egs/callhome_egyptian/s5/local/create_splits +++ b/egs/callhome_egyptian/s5/local/create_splits @@ -11,7 +11,7 @@ fi splitFile=$1 # Train first -for split in train dev test +for split in train dev test sup h5 do cp -r $train_all $data_dir/$split diff --git a/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh b/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..db79dd138b2 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,84 @@ +#!/bin/bash + +# Inherited from the WSJ nnet3 recipe, modified for use with ECA + +# this script is called from scripts like run_ms.sh; it does the common stages +# of the build, such as feature extraction. +# This is actually the same as local/online/run_nnet2_common.sh, except +# for the directory names. + +mfccdir=mfcc + +stage=1 + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +if [ $stage -le 1 ]; then + for datadir in train dev test sup h5; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 40 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + utils/subset_data_dir.sh --first data/train 7388 data/train_small || exit 1 + utils/subset_data_dir.sh --first data/train_hires 7388 data/train_small_hires || exit 1 +fi + +if [ $stage -le 2 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align the si84 data for this purpose. + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/train_small data/lang exp/tri5a exp/nnet3/tri5a_ali_small +fi + +if [ $stage -le 3 ]; then + # Train a small system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_small_hires data/lang \ + exp/nnet3/tri5a_ali_small exp/nnet3/tri5b +fi + +if [ $stage -le 4 ]; then + mkdir -p exp/nnet3 + + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ + --num-frames 400000 data/train_small_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm +fi + +if [ $stage -le 5 ]; then + # even though $nj is just 10, each job uses multiple processes and threads. + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 6 ]; then + # We extract iVectors on all the train_si284 data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires \ + data/train_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/train_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train || exit 1; +fi + +if [ $stage -le 7 ]; then + rm exp/nnet3/.error 2>/dev/null + for data in dev test sup h5; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || touch exp/nnet3/.error & + done + wait + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + +exit 0; diff --git a/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh b/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..bd0a6afbda6 --- /dev/null +++ b/egs/callhome_egyptian/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +train_stage=-10 +dir=exp/nnet3/nnet_tdnn_a +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/callhome_egyptian/s5/run.sh b/egs/callhome_egyptian/s5/run.sh index 53753e31be2..9d1fa692da0 100755 --- a/egs/callhome_egyptian/s5/run.sh +++ b/egs/callhome_egyptian/s5/run.sh @@ -15,16 +15,19 @@ set -e eca_speech=/export/corpora/LDC/LDC97S45 eca_transcripts=/export/corpora/LDC/LDC97T19 eca_lexicon=/export/corpora/LDC/LDC99L22 +sup_speech=/export/corpora/LDC/LDC2002S37 +sup_transcripts=/export/corpora/LDC/LDC2002T38 +h5_speech=/export/corpora/LDC/LDC2002S22 +h5_transcripts=/export/corpora/LDC/LDC2002T39 split=local/splits -local/callhome_data_prep.sh $eca_speech $eca_transcripts +local/callhome_data_prep.sh $eca_speech $eca_transcripts $sup_speech $sup_transcripts $h5_speech $h5_transcripts local/callhome_prepare_dict.sh $eca_lexicon # Added c,j, v to the non silences phones manually utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang - # Make sure that you do not use your test and your dev sets to train the LM # Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation @@ -47,6 +50,8 @@ local/create_splits $split # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir +steps/compute_cmvn_stats.sh data/sup exp/make_mfcc/sup $mfccdir +steps/compute_cmvn_stats.sh data/h5 exp/make_mfcc/h5 $mfccdir steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir @@ -57,7 +62,7 @@ steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir # utterances from those. steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train data/lang exp/mono0a + data/train data/lang exp/mono0a steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; @@ -77,12 +82,11 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 1400 15000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; ( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; )& - steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train data/lang exp/tri2 exp/tri2_ali || exit 1; @@ -125,15 +129,51 @@ steps/train_sat.sh --cmd "$train_cmd" \ exp/tri5a/graph data/dev exp/tri5a/decode_dev )& -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/test exp/tri5a/decode_test +( + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test + # Decode Supplement and H5 + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/sup exp/tri5a/decode_sup + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/h5 exp/tri5a/decode_h5 +)& + +dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ + --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") +dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ + --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + +steps/nnet2/train_pnorm_ensemble.sh \ + --mix-up 5000 --initial-learning-rate 0.008 
--final-learning-rate 0.0008\ + --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ + --cmd "$train_cmd" \ + "${dnn_gpu_parallel_opts[@]}" \ + --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ + data/train data/lang exp/tri5a_ali exp/tri6a_dnn -# Decode CALLHOME -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev -steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ -exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train +( + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev +) & + +# Decode test sets +( + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5 +) & + +wait + +# (TDNN + iVectors) training +# Note that the alignments used by run_tdnn.sh come from the pnorm-ensemble model +# If you choose to skip ensemble training (which is slow), use the best +# fmllr alignments available (tri4a) +# You can modify this in local/nnet/run_tdnn.sh +local/nnet3/run_tdnn.sh exit 0; diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime1/s5/path.sh b/egs/chime1/s5/path.sh index 59966f91a53..1a6fb5f891b 100755 --- a/egs/chime1/s5/path.sh +++ b/egs/chime1/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
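# For reference, an illustrative sketch (not taken from this recipe): on a machine
# with no grid engine all of these can simply point at run.pl, e.g.
#   export train_cmd=run.pl
#   export decode_cmd=run.pl
#   export cuda_cmd=run.pl
# while on a SLURM cluster the same generic options work through utils/slurm.pl, e.g.
#   export train_cmd="slurm.pl --mem 2G"
#   export decode_cmd="slurm.pl --mem 4G"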
-#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/local/chime_format_data.sh b/egs/chime2/s5/local/chime_format_data.sh index 2c0728b943e..5870174aff4 100755 --- a/egs/chime2/s5/local/chime_format_data.sh +++ b/egs/chime2/s5/local/chime_format_data.sh @@ -17,11 +17,9 @@ echo "Preparing train and test data" srcdir=data/local/data lmdir=data/local/nist_lm -tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt -mkdir -p $tmpdir -for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do +for x in test_eval92_clean test_eval92_noisy test_eval92_5k_clean test_eval92_5k_noisy dev_dt_05_clean dev_dt_05_reverb dev_dt_05_noisy dev_dt_20_clean dev_dt_20_reverb dev_dt_20_noisy train_si84_clean train_si84_reverb train_si84_noisy; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -42,25 +40,10 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang/* $test gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." -rm -r $tmpdir diff --git a/egs/chime2/s5/path.sh b/egs/chime2/s5/path.sh index fee0b9b0c11..2d17b17a84a 100755 --- a/egs/chime2/s5/path.sh +++ b/egs/chime2/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime3/README.txt b/egs/chime3/README.txt index 6c55689b298..3d52c0a8a04 100644 --- a/egs/chime3/README.txt +++ b/egs/chime3/README.txt @@ -6,7 +6,7 @@ If you use these data in a publication, please cite: Jon Barker, Ricard Marxer, Emmanuel Vincent, and Shinji Watanabe, The third 'CHiME' Speech Separation and Recognition Challenge: Dataset, task and baselines, submitted to IEEE 2015 Automatic Speech Recognition -and Understanding Workshop (ASRU), 2015. +and Understanding Workshop (ASRU), 2015. Quick instruction: 1) Download CHiME3 data @@ -33,6 +33,7 @@ nohup ./run.sh > run.log local/run_gmm.sh local/run_dnn.sh +local/run_lmrescore.sh You can put in your working directory. But please make sure to use the same directory structure and naming convention with those of the @@ -45,6 +46,7 @@ You don't have to execute local/run_init.sh twice. enhan= GMM clean training: exp/tri3b_tr05_orig_clean/best_wer_$enhan.result GMM multi training: exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result -DNN multi training: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +DNN multi training: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +DNN multi training with LM rescoring: exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats_lmrescore/best_wer_${enhan}_rnnlm_5k_h300_w0.5_n100.result Note that training on clean data means original WSJ0 data only (no booth data) diff --git a/egs/chime3/s5/RESULTS b/egs/chime3/s5/RESULTS new file mode 100644 index 00000000000..7e00f49542a --- /dev/null +++ b/egs/chime3/s5/RESULTS @@ -0,0 +1,95 @@ +# The result based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. 
ASRU'15 +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM multi-condition +exp/tri3b_tr05_multi_noisy/best_wer_noisy.result +------------------- +best overall dt05 WER 18.49% (language model weight = 10) +------------------- +dt05_simu WER: 18.36% (Average), 18.72% (BUS), 22.46% (CAFE), 14.97% (PEDESTRIAN), 17.27% (STREET) +------------------- +dt05_real WER: 18.62% (Average), 26.17% (BUS), 17.18% (CAFE), 12.92% (PEDESTRIAN), 18.20% (STREET) +------------------- +et05_simu WER: 21.40% (Average), 19.14% (BUS), 24.08% (CAFE), 21.68% (PEDESTRIAN), 20.69% (STREET) +------------------- +et05_real WER: 32.54% (Average), 48.76% (BUS), 32.84% (CAFE), 27.30% (PEDESTRIAN), 21.29% (STREET) +------------------- + + +GMM with beamformit +exp/tri3b_tr05_multi_beamformit_5mics/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 13.83% (language model weight = 11) +------------------- +dt05_simu WER: 14.87% (Average), 12.36% (BUS), 17.95% (CAFE), 12.92% (PEDESTRIAN), 16.27% (STREET) +------------------- +dt05_real WER: 12.78% (Average), 16.17% (BUS), 12.20% (CAFE), 9.62% (PEDESTRIAN), 13.14% (STREET) +------------------- +et05_simu WER: 23.13% (Average), 16.27% (BUS), 24.86% (CAFE), 26.06% (PEDESTRIAN), 25.33% (STREET) +------------------- +et05_real WER: 23.06% (Average), 31.31% (BUS), 21.85% (CAFE), 21.86% (PEDESTRIAN), 17.22% (STREET) +------------------- + + +DNN +exp/tri4a_dnn_tr05_multi_beamformit_5mics/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 10.34% (language model weight = 10) +------------------- +dt05_simu WER: 11.08% (Average), 10.09% (BUS), 13.01% (CAFE), 9.23% (PEDESTRIAN), 12.01% (STREET) +------------------- +dt05_real WER: 9.59% (Average), 12.67% (BUS), 9.41% (CAFE), 6.65% (PEDESTRIAN), 9.64% (STREET) +------------------- +et05_simu WER: 17.48% (Average), 12.57% (BUS), 18.04% (CAFE), 18.64% (PEDESTRIAN), 20.66% (STREET) +------------------- +et05_real WER: 17.89% (Average), 26.77% (BUS), 16.57% (CAFE), 14.85% (PEDESTRIAN), 13.37% (STREET) +------------------- + + +DNN sMBR +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_i1lats/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 9.24% (language model weight = 10) + (Number of iterations = 4) +------------------- +dt05_simu WER: 9.90% (Average), 9.38% (BUS), 11.70% (CAFE), 8.14% (PEDESTRIAN), 10.40% (STREET) +------------------- +dt05_real WER: 8.58% (Average), 11.54% (BUS), 8.36% (CAFE), 5.74% (PEDESTRIAN), 8.67% (STREET) +------------------- +et05_simu WER: 16.01% (Average), 11.97% (BUS), 16.49% (CAFE), 16.51% (PEDESTRIAN), 19.07% (STREET) +------------------- +et05_real WER: 15.88% (Average), 23.54% (BUS), 14.21% (CAFE), 13.42% (PEDESTRIAN), 12.35% (STREET) +------------------- + + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +------------------- +best overall dt05 WER 7.73% (language model weight = 10) +------------------- +dt05_simu WER: 8.43% (Average), 7.83% (BUS), 10.19% (CAFE), 6.87% (PEDESTRIAN), 8.83% (STREET) +------------------- +dt05_real WER: 7.02% (Average), 9.13% (BUS), 7.08% (CAFE), 4.62% (PEDESTRIAN), 7.27% (STREET) +------------------- +et05_simu WER: 13.94% (Average), 10.87% (BUS), 14.42% (CAFE), 13.69% (PEDESTRIAN), 16.79% (STREET) +------------------- +et05_real WER: 14.12% (Average), 21.57% (BUS), 12.22% (CAFE), 11.36% (PEDESTRIAN), 11.32% (STREET) 
+------------------- + + +RNNLM +exp/tri4a_dnn_tr05_multi_beamformit_5mics_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 7.14% (language model weight = 6) +------------------- +dt05_simu WER: 7.83% (Average), 7.29% (BUS), 9.62% (CAFE), 6.08% (PEDESTRIAN), 8.33% (STREET) +------------------- +dt05_real WER: 6.45% (Average), 8.48% (BUS), 6.19% (CAFE), 4.53% (PEDESTRIAN), 6.61% (STREET) +------------------- +et05_simu WER: 12.86% (Average), 9.92% (BUS), 13.35% (CAFE), 12.59% (PEDESTRIAN), 15.60% (STREET) +------------------- +et05_real WER: 12.79% (Average), 19.14% (BUS), 11.39% (CAFE), 10.33% (PEDESTRIAN), 10.31% (STREET) +------------------- + diff --git a/egs/chime3/s5/conf/ami.cfg b/egs/chime3/s5/conf/ami.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime3/s5/conf/ami.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime3/s5/local/chime3_beamform.sh b/egs/chime3/s5/local/chime3_beamform.sh new file mode 100755 index 00000000000..170a37ccd84 --- /dev/null +++ b/egs/chime3/s5/local/chime3_beamform.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/chime3_beamform.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 +wdir=data/local/beamforming + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +# we use the following channel signals, and remove 2nd channel signal, which located on the back of +# tablet, and behaves very different from the other front channel signals. +bmf="1 3 4 5 6" +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$wdir/wavfiles.list +find $sdir/*{simu,real} | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$wdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/ami.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/chime3_calc_wers.sh b/egs/chime3/s5/local/chime3_calc_wers.sh index 83d9b7f4251..b083faec56b 100755 --- a/egs/chime3/s5/local/chime3_calc_wers.sh +++ b/egs/chime3/s5/local/chime3_calc_wers.sh @@ -6,7 +6,7 @@ set -e if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s \n\n" `basename $0` printf "%s exp/tri3b_tr05_sr_noisy noisy\n\n" `basename $0` exit 1; fi @@ -28,7 +28,7 @@ for a in `find $dir/decode_tgpr_5k_dt05_real_$enhan/ | grep "\/wer_" | awk -F'[/ cat $dir/decode_tgpr_5k_dt05_{real,simu}_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' else cat $dir/decode_tgpr_5k_dt05_real_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' - fi + fi done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | cut -f 2 -d"_"` diff --git a/egs/chime3/s5/local/chime3_calc_wers_smbr.sh b/egs/chime3/s5/local/chime3_calc_wers_smbr.sh old mode 100644 new mode 100755 index ac63c0febb0..178e7a78b9c --- a/egs/chime3/s5/local/chime3_calc_wers_smbr.sh +++ b/egs/chime3/s5/local/chime3_calc_wers_smbr.sh @@ -6,7 +6,7 @@ set -e if [ $# -ne 3 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` + printf "\nUSAGE: %s \n\n" `basename $0` printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` exit 1; fi diff --git a/egs/chime3/s5/local/chime3_train_lms.sh b/egs/chime3/s5/local/chime3_train_lms.sh new file mode 100755 index 00000000000..984ef766b2a --- /dev/null +++ b/egs/chime3/s5/local/chime3_train_lms.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +# Modified from the script for CHiME3 baseline +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL 
(Author: Takaaki Hori) + +# Config: +order=5 # n-gram order + +. utils/parse_options.sh || exit 1; + +. ./path.sh + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME3 root directory" + echo "If you use kaldi scripts distributed in the CHiME3 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime3_data=$1 +wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# check srilm ngram +! which ngram-count \ + && echo "SRILM tools not installed, which are required for LM training" && exit 1; + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' > $dir/vocab_5k.txt +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.gz +if [ `du -m $dir/train.gz | cut -f 1` -eq 63 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.txt ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' | gzip -c > $dir/train.gz +fi + +# get validation data from CHiME3 dev set +touch $dir/valid.gz +if [ `du -k $dir/valid.gz | cut -f 1` -eq 68 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime3_data/data/transcriptions"; + cut -d" " -f2- $chime3_data/data/transcriptions/dt05_real.trn_all \ + $chime3_data/data/transcriptions/dt05_simu.trn_all \ + |gzip -c > $dir/valid.gz +fi + +# train a large n-gram language model +lm_suffix=${order}gkn_5k +if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then + echo "A $order-gram language model aready exists and is not constructed again" + echo "To reconstruct, remove $dir/$dir/lm_${lm_suffix}.arpa.gz first" +else + echo "Training a $order-gram language model" + ngram-count -text $dir/train.gz -order $order \ + -vocab $dir/vocab_5k.txt -unk -map-unk "" \ + -gt2min 1 -gt3min 1 -gt4min 2 -gt5min 2 \ + -interpolate -kndiscount \ + -lm $dir/lm_${lm_suffix}.arpa.gz +fi +echo "Checking validation perplexity of $order-gram language model" +ngram -order $order -ppl $dir/valid.gz -lm $dir/lm_${lm_suffix}.arpa.gz +# e.g. 5-gram perplexity: +# file data/local/local_lm/valid.txt: 3280 sentences, 54239 words, 3 OOVs +# 0 zeroprobs, logprob= -96775.5 ppl= 48.1486 ppl1= 60.8611 + +# Next, create the corresponding FST and lang_test_* directory. 
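# (For context, an illustrative note: arpa2fst reads the ARPA text on stdin, maps
# words through the given symbol table, and writes G.fst as a weighted word
# acceptor; --disambig-symbol=#0 puts the #0 symbol on the backoff arcs so that
# the resulting G.fst stays determinizable when composed with the lexicon.
# A standalone sketch, assuming an existing data/lang directory:
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 \
#     --read-symbol-table=data/lang/words.txt - data/lang_test/G.fst )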
+echo "Preparing language models for test" +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +test=data/lang_test_${lm_suffix} +mkdir -p $test +for f in phones.txt words.txt phones.txt L.fst L_disambig.fst \ + phones; do + cp -r data/lang/$f $test +done +gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst +fstisstochastic $test/G.fst +# The output is like: +# 9.14233e-05 -0.259833 +# we do expect the first of these 2 numbers to be close to zero (the second is +# nonzero because the backoff weights make the states sum to >1). +# Because of the fiasco for these particular LMs, the first number is not +# as close to zero as it could be. + +# Everything below is only for diagnostic. +# Checking that G has no cycles with empty words on them (e.g. , ); +# this might cause determinization failure of CLG. +# #0 is treated as an empty word. +mkdir -p $tmpdir/g +awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ + < "$lexicon" >$tmpdir/g/select_empty.fst.txt +fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ + fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst +fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + echo "Language model has cycles with empty words" && exit 1 +rm -r $tmpdir/g + +echo "Succeeded in preparing a large ${order}-gram LM" +rm -r $tmpdir diff --git a/egs/chime3/s5/local/chime3_train_rnnlms.sh b/egs/chime3/s5/local/chime3_train_rnnlms.sh new file mode 100755 index 00000000000..429ca828aa3 --- /dev/null +++ b/egs/chime3/s5/local/chime3_train_rnnlms.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +hidden=300 # Num-hidden units +class=200 # Num-classes +rnnlm_ver=rnnlm-0.3e # version of RNNLM to use +threads=1 # for RNNLM-HS +bptt=4 # length of BPTT unfolding in RNNLM +bptt_block=10 # length of BPTT unfolding in RNNLM + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME3 root directory" + echo "If you use kaldi scripts distributed in the CHiME3 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime3_data=$1 +wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. 
Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' | sed "s///" > $dir/vocab_5k.rnn +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.rnn +if [ `du -m $dir/train.rnn | cut -f 1` -eq 223 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.rnn ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' > $dir/train.rnn +fi + +# get validation data from CHiME3 dev set +touch $dir/valid.rnn +if [ `cat $dir/valid.rnn | wc -w` -eq 54239 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime3_data/data/transcriptions"; + cut -d" " -f2- $chime3_data/data/transcriptions/dt05_real.trn_all \ + $chime3_data/data/transcriptions/dt05_simu.trn_all \ + > $dir/valid.rnn +fi + +# RNN language model traing +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + +# train a RNN language model +rnnmodel=$dir/rnnlm_5k_h${hidden}_bptt${bptt} +if [ -f $rnnmodel ]; then + echo "A RNN language model aready exists and is not constructed again" + echo "To reconstruct, remove $rnnmodel first" +else + echo "Training a RNN language model with $rnnlm_ver" + echo "(runtime log is written to $dir/rnnlm.log)" + $train_cmd $dir/rnnlm.log \ + $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -train $dir/train.rnn -valid $dir/valid.rnn \ + -rnnlm $rnnmodel -hidden $hidden -class $class \ + -rand-seed 1 -independent -debug 1 -bptt $bptt -bptt-block $bptt_block || exit 1; +fi + +# store in a RNNLM directory with necessary files +rnndir=data/lang_test_rnnlm_5k_h${hidden} +mkdir -p $rnndir +cp $rnnmodel $rnndir/rnnlm +grep -v -e "" -e "" $dir/vocab_5k.rnn > $rnndir/wordlist.rnn +touch $rnndir/unk.probs # make an empty file because we don't know unk-word probs. + diff --git a/egs/chime3/s5/local/clean_chime3_format_data.sh b/egs/chime3/s5/local/clean_chime3_format_data.sh index d3a2c73471c..f2d81bc5324 100755 --- a/egs/chime3/s5/local/clean_chime3_format_data.sh +++ b/egs/chime3/s5/local/clean_chime3_format_data.sh @@ -20,7 +20,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt mkdir -p $tmpdir -for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do +for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -43,29 +43,15 @@ for lm_suffix in tgpr_5k; do cp -r data/lang/$f $test done gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. 
These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst - # The output is like: - # 9.14233e-05 -0.259833 - # we do expect the first of these 2 numbers to be close to zero (the second is - # nonzero because the backoff weights make the states sum to >1). - # Because of the fiasco for these particular LMs, the first number is not - # as close to zero as it could be. + # The output is like: + # 9.14233e-05 -0.259833 + # we do expect the first of these 2 numbers to be close to zero (the second is + # nonzero because the backoff weights make the states sum to >1). + # Because of the fiasco for these particular LMs, the first number is not + # as close to zero as it could be. # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); @@ -76,7 +62,7 @@ for lm_suffix in tgpr_5k; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/chime3/s5/local/run_dnn.sh b/egs/chime3/s5/local/run_dnn.sh index 1795983ce17..668236dd341 100755 --- a/egs/chime3/s5/local/run_dnn.sh +++ b/egs/chime3/s5/local/run_dnn.sh @@ -12,6 +12,12 @@ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -22,12 +28,16 @@ if [ $# -ne 2 ]; then exit 1; fi -nj=30 - -# enhan data +# set enhanced data enhan=$1 enhan_data=$2 +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + # check whether run_init is executed if [ ! -d data/lang ]; then echo "error, execute local/run_init.sh, first" @@ -40,53 +50,77 @@ if [ ! 
-d exp/tri3b_tr05_multi_$enhan ]; then exit 1; fi -# make 40-dim fbank features for enhan data -fbankdir=fbank/$enhan -mkdir -p data-fbank -for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do - cp -r data/$x data-fbank - steps/make_fbank.sh --nj $nj \ - data-fbank/$x exp/make_fbank/$x $fbankdir || exit 1; -done +# get alignments +if [ $stage -le 0 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali + steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ + data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali_dt05 +fi -# make mixed training set from real and simulation enhancement training data -# multi = simu + real -utils/combine_data.sh data-fbank/tr05_multi_$enhan data-fbank/tr05_simu_$enhan data-fbank/tr05_real_$enhan -utils/combine_data.sh data-fbank/dt05_multi_$enhan data-fbank/dt05_simu_$enhan data-fbank/dt05_real_$enhan -utils/combine_data.sh data-fbank/et05_multi_$enhan data-fbank/et05_simu_$enhan data-fbank/et05_real_$enhan +# make fmllr feature for training multi = simu + real +gmmdir=exp/tri3b_tr05_multi_${enhan}_ali +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 1 ]; then + for x in tr05_real_$enhan tr05_simu_$enhan; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${enhan} +if [ $stage -le 2 ]; then + for x in dt05_real_$enhan et05_real_$enhan dt05_simu_$enhan et05_simu_$enhan; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi -# get alignment -steps/align_fmllr.sh --nj $nj \ - data/tr05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali || exit 1; -steps/align_fmllr.sh --nj 4 \ - data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_${enhan}_ali_dt05 || exit 1; +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 3 ]; then + utils/combine_data.sh $data_fmllr/tr05_multi_$enhan $data_fmllr/tr05_simu_$enhan $data_fmllr/tr05_real_$enhan + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan +fi # pre-train dnn dir=exp/tri4a_dnn_pretrain_tr05_multi_$enhan -$cuda_cmd $dir/_pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 data-fbank/tr05_multi_$enhan $dir +if [ $stage -le 4 ]; then + $cuda_cmd $dir/_pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_$enhan $dir +fi # train dnn dir=exp/tri4a_dnn_tr05_multi_$enhan ali=exp/tri3b_tr05_multi_${enhan}_ali -ali_dev=exp/tri3b_tr05_multi_${enhan}_ali_dt05 +ali_dev=exp/tri3b_tr05_multi_${enhan}_ali_dt05 feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_$enhan/final.feature_transform dbn=exp/tri4a_dnn_pretrain_tr05_multi_$enhan/7.dbn -$cuda_cmd $dir/_train_nnet.log \ -steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ -data-fbank/tr05_multi_$enhan 
data-fbank/dt05_multi_$enhan data/lang $ali $ali_dev $dir || exit 1; - -# decode enhan speech -utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k || exit 1; -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & -steps/nnet/decode.sh --nj 4 --num-threads 4 --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k data-fbank/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & -wait; +if [ $stage -le 5 ]; then + $cuda_cmd $dir/_train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/tr05_multi_$enhan $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir +fi + +# decode enhanced speech +if [ $stage -le 6 ]; then + utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & + wait; +fi # Sequence training using sMBR criterion, we do Stochastic-GD # with per-utterance updates. 
We use usually good acwt 0.1 @@ -96,32 +130,38 @@ srcdir=exp/tri4a_dnn_tr05_multi_${enhan} acwt=0.1 # First we generate lattices and alignments: -# gawk musb be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in +# gawk must be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in # steps/nnet/make_denlats.sh -steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali -steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +if [ $stage -le 7 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +fi # Re-train the DNN by 1 iteration of sMBR -steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +if [ $stage -le 8 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +fi # Decode (reuse HCLG graph) -for ITER in 1; do - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & -done +if [ $stage -le 9 ]; then + for ITER in 1; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet 
$dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + done +fi # Re-generate lattices, run 4 more sMBR iterations dir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats @@ -129,37 +169,47 @@ srcdir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr acwt=0.1 # Generate lattices and alignments: -steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali -steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +if [ $stage -le 10 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_denlats +fi # Re-train the DNN by 4 iterations of sMBR -steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ - data-fbank/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +if [ $stage -le 11 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${enhan} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +fi # Decode (reuse HCLG graph) -for ITER in 1 2 3 4; do - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 4 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k data-fbank/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & -done -wait - -# decoded results of enhan speech using enhan DNN AMs -local/chime3_calc_wers.sh exp/tri4a_dnn_tr05_multi_$enhan $enhan > exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result -head -n 15 exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result -# decoded results of enhan speech using enhan DNN AMs with sequence training -./local/chime3_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k \ +if [ $stage -le 12 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + 
exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + done + wait +fi + +# scoring +if [ $stage -le 13 ]; then + # decoded results of enhanced speech using DNN AMs trained with enhanced data + local/chime3_calc_wers.sh exp/tri4a_dnn_tr05_multi_$enhan $enhan > exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result + head -n 15 exp/tri4a_dnn_tr05_multi_$enhan/best_wer_$enhan.result + # decoded results of enhanced speech using sequence-training DNN + ./local/chime3_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k \ > exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result -head -n 15 exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result + head -n 15 exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats/best_wer_${enhan}.result +fi +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_gmm.sh b/egs/chime3/s5/local/run_gmm.sh index 9ba4dadc14c..5b9fbaa1736 100755 --- a/egs/chime3/s5/local/run_gmm.sh +++ b/egs/chime3/s5/local/run_gmm.sh @@ -12,6 +12,12 @@ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. @@ -22,87 +28,115 @@ if [ $# -ne 2 ]; then exit 1; fi -nj=30 - -# enhan data +# set enhanced data enhan=$1 enhan_data=$2 +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + # check whether run_init is executed if [ ! -d data/lang ]; then echo "error, execute local/run_init.sh, first" exit 1; fi -# process for enhan data -local/real_enhan_chime3_data_prep.sh $enhan $enhan_data || exit 1; -local/simu_enhan_chime3_data_prep.sh $enhan $enhan_data || exit 1; +# process for enhanced data +if [ $stage -le 0 ]; then + local/real_enhan_chime3_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime3_data_prep.sh $enhan $enhan_data +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
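# (An illustrative aside, not part of the recipe: on a shared machine this could
# point at scratch space instead, e.g.
#   mfccdir=/export/scratch/$USER/chime3_mfcc_$enhan
# as long as the same directory is passed to make_mfcc.sh and
# compute_cmvn_stats.sh in the loop below.)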
mfccdir=mfcc/$enhan -for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do - steps/make_mfcc.sh --nj $nj \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done +if [ $stage -le 1 ]; then + for x in dt05_real_$enhan et05_real_$enhan tr05_real_$enhan dt05_simu_$enhan et05_simu_$enhan tr05_simu_$enhan; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi -# make mixed training set from real and simulation enhancement training data +# make mixed training set from real and simulation enhanced data # multi = simu + real -utils/combine_data.sh data/tr05_multi_$enhan data/tr05_simu_$enhan data/tr05_real_$enhan -utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan -utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan - -# decode enhan speech using clean AMs -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_simu_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_simu_$enhan & - -# training models using enhan data -steps/train_mono.sh --boost-silence 1.25 --nj $nj \ - data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --boost-silence 1.25 --nj $nj \ - data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan exp/mono0a_ali_tr05_multi_$enhan || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/tr05_multi_$enhan data/lang exp/mono0a_ali_tr05_multi_$enhan exp/tri1_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --nj $nj \ - data/tr05_multi_$enhan data/lang exp/tri1_tr05_multi_$enhan exp/tri1_ali_tr05_multi_$enhan || exit 1; - -steps/train_lda_mllt.sh \ - --splice-opts "--left-context=3 --right-context=3" \ - 2500 15000 data/tr05_multi_$enhan data/lang exp/tri1_ali_tr05_multi_$enhan exp/tri2b_tr05_multi_$enhan || exit 1; - -steps/align_si.sh --nj $nj \ - --use-graphs true data/tr05_multi_$enhan data/lang exp/tri2b_tr05_multi_$enhan exp/tri2b_ali_tr05_multi_$enhan || exit 1; - -steps/train_sat.sh \ - 2500 15000 data/tr05_multi_$enhan data/lang exp/tri2b_ali_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan || exit 1; - -utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k || exit 1; - -# decode enhan speech using enhan AMs -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_simu_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_real_$enhan 
exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_real_$enhan & -steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_simu_$enhan & - -wait; -# decoded results of enhan speech using clean AMs -local/chime3_calc_wers.sh exp/tri3b_tr05_orig_clean $enhan > exp/tri3b_tr05_orig_clean/best_wer_$enhan.result -head -n 15 exp/tri3b_tr05_orig_clean/best_wer_$enhan.result -# decoded results of enhan speech using enhan AMs -local/chime3_calc_wers.sh exp/tri3b_tr05_multi_$enhan $enhan > exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result -head -n 15 exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result +if [ $stage -le 2 ]; then + utils/combine_data.sh data/tr05_multi_$enhan data/tr05_simu_$enhan data/tr05_real_$enhan + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan +fi + +# decode enhanced speech using clean AMs +if [ $stage -le 3 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_dt05_simu_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_orig_clean/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_orig_clean/decode_tgpr_5k_et05_simu_$enhan & +fi + +# training models using enhanced data +# training monophone model +if [ $stage -le 4 ]; then + steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan + + steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/mono0a_tr05_multi_$enhan exp/mono0a_ali_tr05_multi_$enhan +fi + +# training triphone model with delta and delta-delta features +if [ $stage -le 5 ]; then + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/tr05_multi_$enhan data/lang exp/mono0a_ali_tr05_multi_$enhan exp/tri1_tr05_multi_$enhan + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_$enhan data/lang exp/tri1_tr05_multi_$enhan exp/tri1_ali_tr05_multi_$enhan +fi + +# training triphone model with LDA+MLLT features +if [ $stage -le 6 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/tr05_multi_$enhan data/lang exp/tri1_ali_tr05_multi_$enhan exp/tri2b_tr05_multi_$enhan + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + --use-graphs true data/tr05_multi_$enhan data/lang exp/tri2b_tr05_multi_$enhan exp/tri2b_ali_tr05_multi_$enhan +fi + +# training triphone model with SAT +if [ $stage -le 7 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/tr05_multi_$enhan data/lang exp/tri2b_ali_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_$enhan exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 8 ]; then + 
steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_dt05_simu_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_$enhan/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_$enhan/decode_tgpr_5k_et05_simu_$enhan & + wait; +fi + +# scoring +if [ $stage -le 9 ]; then + # decoded results of enhanced speech using clean AMs + local/chime3_calc_wers.sh exp/tri3b_tr05_orig_clean $enhan > exp/tri3b_tr05_orig_clean/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_orig_clean/best_wer_$enhan.result + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime3_calc_wers.sh exp/tri3b_tr05_multi_$enhan $enhan > exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_$enhan/best_wer_$enhan.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_init.sh b/egs/chime3/s5/local/run_init.sh index 2f923298e38..9db289a12a5 100755 --- a/egs/chime3/s5/local/run_init.sh +++ b/egs/chime3/s5/local/run_init.sh @@ -5,6 +5,12 @@ # Mitsubishi Electric Research Labs (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Config: +nj=30 +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + # This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 # made by Chao Weng @@ -23,32 +29,38 @@ fi # This is a shell script, but it's recommended that you run the commands one by # one by copying and pasting into the shell. -nj=30 # clean data chime3_data=$1 wsj0_data=$chime3_data/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory eval_flag=true # make it true when the evaluation data are released -# process for clean speech and making LMs etc. from original WSJ0 -# note that training on clean data means original WSJ0 data only (no booth data) -local/clean_wsj0_data_prep.sh $wsj0_data || exit 1; - -local/wsj_prepare_dict.sh || exit 1; - -utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; - -local/clean_chime3_format_data.sh || exit 1; +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [ $stage -le 0 ]; then + # process for clean speech and making LMs etc. 
from original WSJ0 + # note that training on clean data means original WSJ0 data only (no booth data) + local/clean_wsj0_data_prep.sh $wsj0_data + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang + local/clean_chime3_format_data.sh +fi -# process for close talking speech for real data (will not be used) -local/real_close_chime3_data_prep.sh $chime3_data || exit 1; +if [ $stage -le 1 ]; then + # process for close talking speech for real data (will not be used) + # local/real_close_chime3_data_prep.sh $chime3_data -# process for booth recording speech (will not be used) -# local/bth_chime3_data_prep.sh $chime3_data || exit 1; + # process for booth recording speech (will not be used) + # local/bth_chime3_data_prep.sh $chime3_data -# process for distant talking speech for real and simulation data -local/real_noisy_chime3_data_prep.sh $chime3_data || exit 1; -local/simu_noisy_chime3_data_prep.sh $chime3_data || exit 1; + # process for distant talking speech for real and simulation data + local/real_noisy_chime3_data_prep.sh $chime3_data + local/simu_noisy_chime3_data_prep.sh $chime3_data +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you @@ -72,74 +84,88 @@ else list=$list" tr05_simu_noisy dt05_simu_noisy" fi mfccdir=mfcc -for x in $list; do - steps/make_mfcc.sh --nj 8 \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done +if [ $stage -le 2 ]; then + for x in $list; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi # make mixed training set from real and simulation training data # multi = simu + real -utils/combine_data.sh data/tr05_multi_noisy data/tr05_simu_noisy data/tr05_real_noisy -utils/combine_data.sh data/dt05_multi_noisy data/dt05_simu_noisy data/dt05_real_noisy +if [ $stage -le 3 ]; then + utils/combine_data.sh data/tr05_multi_noisy data/tr05_simu_noisy data/tr05_real_noisy + utils/combine_data.sh data/dt05_multi_noisy data/dt05_simu_noisy data/dt05_real_noisy +fi # training models for clean and noisy data # if you want to check the performance of the ASR only using real/simu data # please try to add "tr05_real_noisy" "tr05_simu_noisy" -#for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do -for train in tr05_multi_noisy tr05_orig_clean; do - nspk=`wc -l data/$train/spk2utt | awk '{print $1}'` - if [ $nj -gt $nspk ]; then - nj2=$nspk - else - nj2=$nj - fi - steps/train_mono.sh --boost-silence 1.25 --nj $nj2 \ - data/$train data/lang exp/mono0a_$train || exit 1; - - steps/align_si.sh --boost-silence 1.25 --nj $nj2 \ - data/$train data/lang exp/mono0a_$train exp/mono0a_ali_$train || exit 1; - - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/$train data/lang exp/mono0a_ali_$train exp/tri1_$train || exit 1; - - steps/align_si.sh --nj $nj2 \ - data/$train data/lang exp/tri1_$train exp/tri1_ali_$train || exit 1; - - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=3 --right-context=3" \ - 2500 15000 data/$train data/lang exp/tri1_ali_$train exp/tri2b_$train || exit 1; - - steps/align_si.sh --nj $nj2 \ - --use-graphs true data/$train data/lang exp/tri2b_$train exp/tri2b_ali_$train || exit 1; - - steps/train_sat.sh \ - 2500 15000 data/$train data/lang exp/tri2b_ali_$train exp/tri3b_$train || exit 1; - - utils/mkgraph.sh 
data/lang_test_tgpr_5k exp/tri3b_$train exp/tri3b_$train/graph_tgpr_5k || exit 1; - - # if you want to know the result of the close talk microphone, plese try the following - # decode close speech - # steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - # exp/tri3b_$train/graph_tgpr_5k data/dt05_real_close exp/tri3b_$train/decode_tgpr_5k_dt05_real_close & - # steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - # exp/tri3b_$train/graph_tgpr_5k data/et05_real_close exp/tri3b_$train/decode_tgpr_5k_et05_real_close & - # decode real noisy speech - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/dt05_real_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_real_noisy & - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/et05_real_noisy exp/tri3b_$train/decode_tgpr_5k_et05_real_noisy & - # decode simu noisy speech - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/dt05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_simu_noisy & - steps/decode_fmllr.sh --nj 4 --num-threads 4 \ - exp/tri3b_$train/graph_tgpr_5k data/et05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_et05_simu_noisy & -done -wait +# for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do +if [ $stage -le 4 ]; then + for train in tr05_multi_noisy tr05_orig_clean; do + nspk=`wc -l data/$train/spk2utt | awk '{print $1}'` + if [ $nj -gt $nspk ]; then + nj2=$nspk + else + nj2=$nj + fi + # training monophone model + steps/train_mono.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/mono0a_$train + steps/align_si.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/mono0a_$train exp/mono0a_ali_$train + + # training triphone models: delta features, then LDA+MLLT, then SAT + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/$train data/lang exp/mono0a_ali_$train exp/tri1_$train + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + data/$train data/lang exp/tri1_$train exp/tri1_ali_$train + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/$train data/lang exp/tri1_ali_$train exp/tri2b_$train + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + --use-graphs true data/$train data/lang exp/tri2b_$train exp/tri2b_ali_$train + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/$train data/lang exp/tri2b_ali_$train exp/tri3b_$train + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_$train exp/tri3b_$train/graph_tgpr_5k + done +fi + +# decoding +if [ $stage -le 5 ]; then + for train in tr05_multi_noisy tr05_orig_clean; do + # if you want to know the result of the close talk microphone, please try the following + # decode close speech + # steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + # exp/tri3b_$train/graph_tgpr_5k data/dt05_real_close exp/tri3b_$train/decode_tgpr_5k_dt05_real_close & + # steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + # exp/tri3b_$train/graph_tgpr_5k data/et05_real_close exp/tri3b_$train/decode_tgpr_5k_et05_real_close & + + # decode real noisy speech + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/dt05_real_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_real_noisy & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/et05_real_noisy exp/tri3b_$train/decode_tgpr_5k_et05_real_noisy & + # decode simu noisy speech + 
steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/dt05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_dt05_simu_noisy & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_$train/graph_tgpr_5k data/et05_simu_noisy exp/tri3b_$train/decode_tgpr_5k_et05_simu_noisy & + done + wait +fi # get the best scores -#for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do -for train in tr05_multi_noisy tr05_orig_clean; do - local/chime3_calc_wers.sh exp/tri3b_$train noisy > exp/tri3b_$train/best_wer_noisy.result - head -n 15 exp/tri3b_$train/best_wer_noisy.result -done +if [ $stage -le 6 ]; then + #for train in tr05_multi_noisy tr05_real_noisy tr05_simu_noisy tr05_orig_clean; do + for train in tr05_multi_noisy tr05_orig_clean; do + local/chime3_calc_wers.sh exp/tri3b_$train noisy > exp/tri3b_$train/best_wer_noisy.result + head -n 15 exp/tri3b_$train/best_wer_noisy.result + done +fi + +echo "`basename $0` Done." diff --git a/egs/chime3/s5/local/run_lmrescore.sh b/egs/chime3/s5/local/run_lmrescore.sh new file mode 100755 index 00000000000..0c364367c98 --- /dev/null +++ b/egs/chime3/s5/local/run_lmrescore.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a root directory of CHiME3 data" + echo "Second argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# data root +chime3_data=$1 +# enhan data +enhan=$2 + +# check data +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi + +# check whether run_dnn is executed +srcdir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_i1lats +if [ ! -d $srcdir ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; +fi + +# train a high-order n-gram language model +if [ $stage -le 1 ]; then + local/chime3_train_lms.sh $chime3_data || exit 1; +fi + +# train a RNN language model +if [ $stage -le 2 ]; then + local/chime3_train_rnnlms.sh $chime3_data || exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${enhan}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${enhan}/graph_tgpr_5k . + popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! 
-f $srcdir/log/best_wer_$enhan ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + for t in dt05_simu dt05_real et05_simu et05_real; do + steps/lmrescore.sh --mode 3 \ + data/lang_test_tgpr_5k \ + data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime3_calc_wers.sh $dir ${enhan}_${lm_suffix} \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + for t in dt05_simu dt05_real et05_simu et05_real; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + data/lang_test_${lm_suffix} \ + data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime3_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi + diff --git a/egs/chime3/s5/path.sh b/egs/chime3/s5/path.sh index fc9eaf0192e..a4772b7d89d 100755 --- a/egs/chime3/s5/path.sh +++ b/egs/chime3/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/tools/kaldi_lm/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/tools/kaldi_lm/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/chime3/s5/run.sh b/egs/chime3/s5/run.sh old mode 100644 new mode 100755 index a934055ab0b..f7cc389f37a --- a/egs/chime3/s5/run.sh +++ b/egs/chime3/s5/run.sh @@ -8,27 +8,67 @@ # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh -. ./cmd.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail # You can execute run_init.sh only "once" -# This creates LMs, basic task files, basic models, +# This creates LMs, basic task files, basic models, # baseline results without speech enhancement techniques, and so on. 
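# All of the steps below are gated on $stage, so a partially finished run can be resumed
# without redoing earlier work; for example, ./run.sh --stage 2 restarts from the GMM
# experiments and skips the initialization and beamforming stages.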
# Please set a main root directory of the CHiME3 data -# If you use kaldi scripts distributed in the CHiME3 data, -chime3_data=`pwd`/../.. -# Otherwise, please specify it, e.g., -# chime3_data=/local_data/watanabe/work/201410CHiME3/CHiME3 -local/run_init.sh $chime3_data +# If you use kaldi scripts distributed in the CHiME3 data, +# chime3_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime3_data=/data2/archive/speech-db/original/public/CHiME3 +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime3 data root correctly" && exit 1 +fi +if [ $stage -le 0 ]; then + local/run_init.sh $chime3_data +fi + +# Using Beamformit +# This results in better performance than the CHiME3 official beamforming +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_5mics +enhancement_data=`pwd`/$enhancement_method +if [ $stage -le 1 ]; then + local/chime3_beamform.sh --cmd "$train_cmd" --nj 20 $chime3_data/data/audio/16kHz/isolated $enhancement_data +fi # GMM based ASR experiment # Please set a directory of your speech enhancement method. # run_gmm.sh can be done every time when you change a speech enhancement technique. # The directory structure and audio files must follow the attached baseline enhancement directory -enhancement_method=enhanced -enhancement_data=$chime3_data/data/audio/16kHz/enhanced -local/run_gmm.sh $enhancement_method $enhancement_data +# if you want to use the CHiME3 official enhanced data, please comment out the following +# enhancement_method=enhanced +# enhancement_data=$chime3_data/data/audio/16kHz/enhanced +if [ $stage -le 2 ]; then + local/run_gmm.sh $enhancement_method $enhancement_data +fi # DNN based ASR experiment # Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. # You may execute it after you would have promising results using GMM-based ASR experiments -local/run_dnn.sh $enhancement_method $enhancement_data \ No newline at end of file +if [ $stage -le 3 ]; then + local/run_dnn.sh $enhancement_method $enhancement_data +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + local/run_lmrescore.sh $chime3_data $enhancement_method +fi + +echo "Done." diff --git a/egs/chime4/README.txt b/egs/chime4/README.txt new file mode 100644 index 00000000000..e1d8f35e3f6 --- /dev/null +++ b/egs/chime4/README.txt @@ -0,0 +1,11 @@ +This is a kaldi recipe for the 4th CHiME Speech Separation and Recognition Challenge (CHiME-4). The +challenge revisits the datasets originally recorded for CHiME-3, i.e., Wall Street Journal corpus sentences +spoken by talkers situated in challenging noisy environments recorded using a 6-channel tablet based +microphone array. CHiME-4 increases the level of difficulty by constraining the number of microphones +available for testing (i.e., separate 1, 2 and 6 channel tracks). + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. 
+ + s5_1ch: 1 channel track + s5_2ch: 2 channel track + s5_6ch: 6 channel track diff --git a/egs/chime4/s5_1ch/RESULTS b/egs/chime4/s5_1ch/RESULTS new file mode 100644 index 00000000000..5654b6400ef --- /dev/null +++ b/egs/chime4/s5_1ch/RESULTS @@ -0,0 +1,47 @@ +# CHiME-4 1ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition without enhancement +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result +------------------- +dt05_simu WER: 24.48% (Average), 20.37% (BUS), 29.78% (CAFE), 20.49% (PEDESTRIAN), 27.27% (STREET) +------------------- +dt05_real WER: 22.16% (Average), 27.32% (BUS), 23.07% (CAFE), 16.29% (PEDESTRIAN), 21.96% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_isolated_1ch_track.result +------------------- +best overall dt05 WER 15.17% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 15.67% (Average), 14.09% (BUS), 18.97% (CAFE), 12.76% (PEDESTRIAN), 16.89% (STREET) +------------------- +dt05_real WER: 14.67% (Average), 18.97% (BUS), 15.28% (CAFE), 9.88% (PEDESTRIAN), 14.56% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_isolated_1ch_track_5gkn_5k.result +------------------- +best overall dt05 WER 13.46% (language model weight = 11) +------------------- +dt05_simu WER: 13.99% (Average), 13.02% (BUS), 16.76% (CAFE), 11.12% (PEDESTRIAN), 15.07% (STREET) +------------------- +dt05_real WER: 12.93% (Average), 16.89% (BUS), 13.48% (CAFE), 8.53% (PEDESTRIAN), 12.82% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_isolated_1ch_track_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 12.28% (language model weight = 11) +------------------- +dt05_simu WER: 12.98% (Average), 11.90% (BUS), 15.90% (CAFE), 9.94% (PEDESTRIAN), 14.19% (STREET) +------------------- +dt05_real WER: 11.57% (Average), 15.13% (BUS), 11.81% (CAFE), 7.42% (PEDESTRIAN), 11.90% (STREET) +------------------- + + diff --git a/egs/chime4/s5_1ch/cmd.sh b/egs/chime4/s5_1ch/cmd.sh new file mode 100755 index 00000000000..114fbff7a17 --- /dev/null +++ b/egs/chime4/s5_1ch/cmd.sh @@ -0,0 +1,22 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +#export train_cmd="queue.pl --mem 2G" +#export cuda_cmd="queue.pl --mem 2G --gpu 1" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... +export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_1ch/conf/chime4.cfg b/egs/chime4/s5_1ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_1ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file (originally distributed for AMI data, http://groups.inf.ed.ac.uk/ami/download/), used here for CHiME-4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#maximum number of cross-correlation points taken into account +nbest_amount = 4 + +#flag whether to apply automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_1ch/conf/decode_dnn.config b/egs/chime4/s5_1ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_1ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_1ch/conf/fbank.conf b/egs/chime4/s5_1ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_1ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # disable Dan's window, use the standard Hamming window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME-4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/timit/s3/conf/mfcc.conf b/egs/chime4/s5_1ch/conf/mfcc.conf similarity index 100% rename from egs/timit/s3/conf/mfcc.conf rename to egs/chime4/s5_1ch/conf/mfcc.conf diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers.sh new file mode 100755 index 00000000000..079668520f4 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0. + +set -e + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +. path.sh + +dir=$1 +enhan=$2 +graph_dir=$3 + +echo "compute dt05 WER for each location" +echo "" +mkdir -p $dir/log +for a in `find $dir/decode_tgpr_5k_dt05_real_$enhan/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort`; do + echo -n "$a " + if [ -e $dir/decode_tgpr_5k_dt05_simu_$enhan ]; then + cat $dir/decode_tgpr_5k_dt05_{real,simu}_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + else + cat $dir/decode_tgpr_5k_dt05_real_$enhan/$a | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + fi +done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan + +lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | cut -f 2 -d"_"` +echo "-------------------" +printf "best overall dt05 WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan` +echo -n "%" +printf " (language model weight = %s)\n" $lmw +echo "-------------------" +if $eval_flag; then + tasks="dt05 et05" +else + tasks="dt05" +fi +for e_d in $tasks; do + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_$enhan + if [ -e $rdir ]; then + for a in _BUS _CAF _PED _STR; do + grep $a $rdir/scoring/test_filt.txt \ + > $rdir/scoring/test_filt_$a.txt + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g \ + | compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \ + 1> $rdir/${a}_wer_$lmw 2> /dev/null + done + echo -n "${e_d}_${task} WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), " + echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), " + echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), " + echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), " + echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)" + echo "" + echo "-------------------" + fi + done +done +echo "" + +for e_d in $tasks; do + echo "-----------------------------" + echo "1-best transcription for $e_d" + echo "-----------------------------" + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_$enhan + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g + done +done diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh new file mode 100755 index 00000000000..4990423a8a7 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers_smbr.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0. + +set -e + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + printf "%s exp/tri3b_tr05_sr_noisy noisy exp/tri4a_dnn_tr05_sr_noisy/graph_tgpr_5k\n\n" `basename $0` + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +. 
path.sh + +dir=$1 +enhan=$2 +graph_dir=$3 + +echo "compute WER for each location" +echo "" +mkdir -p $dir/log +# collect scores +for x in `find $dir/ -type d -name "*_it*" | awk -F "_it" '{print $NF}' | sort | uniq`; do + for y in `find $dir/*_${enhan}_it*/ | grep "\/wer_" | awk -F'[/]' '{print $NF}' | sort | uniq`; do + echo -n "${x}_$y " + cat $dir/decode_tgpr_5k_dt05_{real,simu}_${enhan}_it$x/$y | grep WER | awk '{err+=$4} {wrd+=$6} END{printf("%.2f\n",err/wrd*100)}' + done +done | sort -n -k 2 | head -n 1 > $dir/log/best_wer_$enhan + +lmw=`cut -f 1 -d" " $dir/log/best_wer_$enhan | awk -F'[_]' '{print $NF}'` +it=`cut -f 1 -d" " $dir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` +echo "-------------------" +printf "best overall dt05 WER %s" `cut -f 2 -d" " $dir/log/best_wer_$enhan` +echo -n "%" +printf " (language model weight = %s)\n" $lmw +printf " (Number of iterations = %s)\n" $it +echo "-------------------" +if $eval_flag; then + tasks="dt05 et05" +else + tasks="dt05" +fi +for e_d in $tasks; do + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_${enhan}_it$it + for a in _BUS _CAF _PED _STR; do + grep $a $rdir/scoring/test_filt.txt \ + > $rdir/scoring/test_filt_$a.txt + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g \ + | compute-wer --text --mode=present ark:$rdir/scoring/test_filt_$a.txt ark,p:- \ + 1> $rdir/${a}_wer_$lmw 2> /dev/null + done + echo -n "${e_d}_${task} WER: `grep WER $rdir/wer_$lmw | cut -f 2 -d" "`% (Average), " + echo -n "`grep WER $rdir/_BUS_wer_$lmw | cut -f 2 -d" "`% (BUS), " + echo -n "`grep WER $rdir/_CAF_wer_$lmw | cut -f 2 -d" "`% (CAFE), " + echo -n "`grep WER $rdir/_PED_wer_$lmw | cut -f 2 -d" "`% (PEDESTRIAN), " + echo -n "`grep WER $rdir/_STR_wer_$lmw | cut -f 2 -d" "`% (STREET)" + echo "" + echo "-------------------" + done +done + +for e_d in $tasks; do + echo "-----------------------------" + echo "1-best transcription for $e_d" + echo "-----------------------------" + for task in simu real; do + rdir=$dir/decode_tgpr_5k_${e_d}_${task}_${enhan}_it$it + cat $rdir/scoring/$lmw.tra \ + | utils/int2sym.pl -f 2- $graph_dir/words.txt \ + | sed s:\::g + done +done diff --git a/egs/chime4/s5_1ch/local/chime4_train_lms.sh b/egs/chime4/s5_1ch/local/chime4_train_lms.sh new file mode 100755 index 00000000000..06dd716e789 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_train_lms.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Modified from the script for CHiME3 baseline +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +order=5 # n-gram order + +. utils/parse_options.sh || exit 1; + +. ./path.sh + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a Chime4 root directory" + echo "If you use kaldi scripts distributed in the Chime4 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# check whether run_init is executed +if [ ! 
-d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# check srilm ngram +! which ngram-count \ + && echo "SRILM tools not installed, which are required for LM training" && exit 1; + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + ngram -lm $srclm -unk -map-unk '' -write-vocab $dir/vocab_5k.txt +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.gz +if [ `du -m $dir/train.gz | cut -f 1` -eq 63 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.txt ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' | gzip -c > $dir/train.gz +fi + +# get validation data from Chime4 dev set +touch $dir/valid.gz +if [ `du -k $dir/valid.gz | cut -f 1` -eq 68 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime4_data/data/transcriptions"; + cut -d" " -f2- $chime4_data/data/transcriptions/dt05_real.trn_all \ + $chime4_data/data/transcriptions/dt05_simu.trn_all \ + |gzip -c > $dir/valid.gz +fi + +# train a large n-gram language model +lm_suffix=${order}gkn_5k +if [ -f $dir/lm_${lm_suffix}.arpa.gz ]; then + echo "A $order-gram language model aready exists and is not constructed again" + echo "To reconstruct, remove $dir/lm_${lm_suffix}.arpa.gz first" +else + echo "Training a $order-gram language model" + ngram-count -text $dir/train.gz -order $order \ + -vocab $dir/vocab_5k.txt -unk -map-unk "" \ + -gt2min 1 -gt3min 1 -gt4min 2 -gt5min 2 \ + -interpolate -kndiscount \ + -lm $dir/lm_${lm_suffix}.arpa.gz +fi +echo "Checking validation perplexity of $order-gram language model" +ngram -order $order -ppl $dir/valid.gz -lm $dir/lm_${lm_suffix}.arpa.gz +# e.g. 5-gram perplexity: +# file data/local/local_lm/valid.txt: 3280 sentences, 54239 words, 3 OOVs +# 0 zeroprobs, logprob= -96775.5 ppl= 48.1486 ppl1= 60.8611 + +# convert arpa LM to G.fst +echo "Converting the $order-gram language model to G.fst" +test=data/lang_test_${lm_suffix} +mkdir -p $test +cp -r data/lang/* $test || exit 1; + +gunzip -c $dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + +utils/validate_lang.pl --skip-determinization-check $test || exit 1; + +echo "Succeeded in $order-gram LM training and conversion to G.fst" + diff --git a/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh b/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh new file mode 100755 index 00000000000..8324c8e06b1 --- /dev/null +++ b/egs/chime4/s5_1ch/local/chime4_train_rnnlms.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +# Config: +hidden=300 # Num-hidden units +class=200 # Num-classes +rnnlm_ver=rnnlm-0.3e # version of RNNLM to use +threads=1 # for RNNLM-HS +bptt=4 # length of BPTT unfolding in RNNLM +bptt_block=10 # length of BPTT unfolding in RNNLM + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. 
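+
+# Example invocation (the CHiME-4 path below is illustrative; --hidden and --bptt are the
+# options defined in the Config block above):
+#   local/chime4_train_rnnlms.sh --hidden 300 --bptt 4 /path/to/CHiME4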
+ +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specify a Chime4 root directory" + echo "If you use kaldi scripts distributed in the Chime4 data," + echo "It would be `pwd`/../.." + exit 1; +fi + +# check data directories +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +if [ ! -d $wsj0_data ]; then + echo "$wsj0_data does not exist. Please specify WSJ0 corpus directory" && exit 1 +fi +lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data + +# lm directories +dir=data/local/local_lm +srcdir=data/local/nist_lm +mkdir -p $dir + +# extract 5k vocabulary from a baseline language model +srclm=$srcdir/lm_tgpr_5k.arpa.gz +if [ -f $srclm ]; then + echo "Getting vocabulary from a baseline language model"; + gunzip -c $srclm | awk 'BEGIN{unig=0}{ + if(unig==0){ + if($1=="\\1-grams:"){unig=1}} + else { + if ($1 != "") { + if ($1=="\\2-grams:" || $1=="\\end\\") {exit} + else {print $2}} + }}' | sed "s///" > $dir/vocab_5k.rnn +else + echo "Language model $srclm does not exist" && exit 1; +fi + +# collect training data from WSJ0 +touch $dir/train.rnn +if [ `du -m $dir/train.rnn | cut -f 1` -eq 223 ]; then + echo "Not getting training data again [already exists]"; +else + echo "Collecting training data from $lm_train"; + gunzip -c $lm_train/{87,88,89}/*.z \ + | awk -v voc=$dir/vocab_5k.rnn ' + BEGIN{ while((getline0) { invoc[$1]=1; }} + /^ "); } + } + printf("\n"); + }' > $dir/train.rnn +fi + +# get validation data from Chime4 dev set +touch $dir/valid.rnn +if [ `cat $dir/valid.rnn | wc -w` -eq 54239 ]; then + echo "Not getting validation data again [already exists]"; +else + echo "Collecting validation data from $chime4_data/data/transcriptions"; + cut -d" " -f2- $chime4_data/data/transcriptions/dt05_real.trn_all \ + $chime4_data/data/transcriptions/dt05_simu.trn_all \ + > $dir/valid.rnn +fi + +# RNN language model training +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + +# train a RNN language model +rnnmodel=$dir/rnnlm_5k_h${hidden}_bptt${bptt} +if [ -f $rnnmodel ]; then + echo "An RNN language model already exists and will not be constructed again" + echo "To reconstruct, remove $rnnmodel first" +else + echo "Training a RNN language model with $rnnlm_ver" + echo "(runtime log is written to $dir/rnnlm.log)" + $train_cmd $dir/rnnlm.log \ + $KALDI_ROOT/tools/$rnnlm_ver/rnnlm -train $dir/train.rnn -valid $dir/valid.rnn \ + -rnnlm $rnnmodel -hidden $hidden -class $class \ + -rand-seed 1 -independent -debug 1 -bptt $bptt -bptt-block $bptt_block || exit 1; +fi + +# store in a RNNLM directory with necessary files +rnndir=data/lang_test_rnnlm_5k_h${hidden} +mkdir -p $rnndir +cp $rnnmodel $rnndir/rnnlm +grep -v -e "" -e "" $dir/vocab_5k.rnn > $rnndir/wordlist.rnn +touch $rnndir/unk.probs # make an empty file because we don't know unk-word probs. 
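+
+# The directory prepared above (data/lang_test_rnnlm_5k_h${hidden}) is the RNNLM directory
+# that steps/rnnlmrescore.sh consumes for N-best rescoring, as in the run_lmrescore.sh
+# flow shown earlier for CHiME-3.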
+ diff --git a/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh b/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh new file mode 100755 index 00000000000..23dc8a70d9e --- /dev/null +++ b/egs/chime4/s5_1ch/local/clean_chime4_format_data.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Copyright 2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# 2015 Guoguo Chen +# 2016 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) +# Apache 2.0 + +# This script takes data prepared in a corpus-dependent way +# in data/local/, and converts it into the "canonical" form, +# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug, +# data/train_si84, etc. + +lang_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +. ./path.sh || exit 1; + +echo "Preparing train and test data" +srcdir=data/local/data +lmdir=data/local/nist_lm +tmpdir=data/local/lm_tmp +lexicon=data/local/lang_tmp/lexiconp.txt +mkdir -p $tmpdir + +for x in et05_orig_clean dt05_orig_clean tr05_orig_clean; do + mkdir -p data/$x + cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; + cp $srcdir/$x.txt data/$x/text || exit 1; + cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1; + cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1; + utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1; +done + + +# Next, for each type of language model, create the corresponding FST +# and the corresponding lang_test_* directory. + +echo Preparing language models for test + +for lm_suffix in tgpr_5k; do + test=data/lang${lang_suffix}_test_${lm_suffix} + + mkdir -p $test + cp -r data/lang${lang_suffix}/* $test || exit 1; + + gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." +rm -r $tmpdir diff --git a/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh b/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh new file mode 100755 index 00000000000..8c6989bc0b2 --- /dev/null +++ b/egs/chime4/s5_1ch/local/clean_wsj0_data_prep.sh @@ -0,0 +1,152 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level WSJ corpus directory." + echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +wsj0=$1 + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if [ -z $IRSTLM ] ; then + export IRSTLM=$KALDI_ROOT/tools/irstlm/ +fi +export PATH=${PATH}:$IRSTLM/bin +if ! command -v prune-lm >/dev/null 2>&1 ; then + echo "$0: Error: the IRSTLM is not available or compiled" >&2 + echo "$0: Error: We used to install it by default, but." >&2 + echo "$0: Error: this is no longer the case." 
>&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_irstlm.sh" >&2 + exit 1 +fi + +cd $dir + +# This version for SI-84 +cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \ + | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05_orig_clean.flist + +# Now for the test sets. +# $wsj0/wsj1/doc/indices/readme.doc +# describes all the different test sets. +# Note: each test-set seems to come in multiple versions depending +# on different vocabulary sizes, verbalized vs. non-verbalized +# pronunciations, etc. We use the largest vocab and non-verbalized +# pronunciations. +# The most normal one seems to be the "baseline 60k test set", which +# is h1_p0. + +# Nov'92 (330 utts, 5k vocab) +cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \ + $local/cstr_ndx2flist.pl $wsj0 | sort > et05_orig_clean.flist + +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know +# why (could be older versions of the disks). +find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05_orig_clean.flist + +# Finding the transcript files: +find -L $wsj0 -iname '*.dot' > dot_files.flist + +# Convert the transcripts into our format (no normalization yet) +# adding suffix to utt_id +# 0 for clean condition +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp + cat ${x}_sph_tmp.scp | awk '{print $1}' \ + | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1 + cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp + cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1 +done + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp \ + > ${x}_wav.scp +done + +# Make the utt2spk and spk2utt files. +for x in tr05_orig_clean et05_orig_clean dt05_orig_clean; do + cat ${x}_sph.scp | awk '{print $1}' \ + | perl -ane 'chop; m:^...:; print "$_ $&\n";' > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +#in case we want to limit lm's on most frequent words, copy lm training word frequency list +cp $wsj0/wsj0/doc/lng_modl/vocab/wfl_64.lst $lmdir +chmod u+w $lmdir/*.lst # had weird permissions on source. + +# The 5K vocab language model without verbalized pronunciations. +# This is used for 3rd CHiME challenge +# trigram would be: !only closed vocabulary here! +cp $wsj0/wsj0/doc/lng_modl/base_lm/tcb05cnp.z $lmdir/lm_tg_5k.arpa.gz || exit 1; +chmod u+rw $lmdir/lm_tg_5k.arpa.gz +gunzip $lmdir/lm_tg_5k.arpa.gz +tail -n 4328839 $lmdir/lm_tg_5k.arpa | gzip -c -f > $lmdir/lm_tg_5k.arpa.gz +rm $lmdir/lm_tg_5k.arpa + +prune-lm --threshold=1e-7 $lmdir/lm_tg_5k.arpa.gz $lmdir/lm_tgpr_5k.arpa || exit 1; +gzip -f $lmdir/lm_tgpr_5k.arpa || exit 1; + + +if [ ! 
-f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then + rm -f wsj0-train-spkrinfo.txt + wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \ + || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ); +fi + +if [ ! -f wsj0-train-spkrinfo.txt ]; then + echo "Could not get the spkrinfo.txt file from LDC website (moved)?" + echo "This is possibly omitted from the training disks; couldn't find it." + echo "Everything else may have worked; we just may be missing gender info" + echo "which is only needed for VTLN-related diagnostics anyway." + exit 1 +fi +# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the +# LDC put it on the web. Perhaps it was accidentally omitted from the +# disks. + +cat $wsj0/wsj0/doc/spkrinfo.txt \ + ./wsj0-train-spkrinfo.txt | \ + perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \ + awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender + + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl b/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl new file mode 100755 index 00000000000..d19db421a9f --- /dev/null +++ b/egs/chime4/s5_1ch/local/cstr_ndx2flist.pl @@ -0,0 +1,54 @@ +#!/usr/bin/env perl + +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 12/1/12 + +# This program takes as its standard input an .ndx file from the WSJ corpus that looks +# like this: +#;; File: tr_s_wv1.ndx, updated 04/26/94 +#;; +#;; Index for WSJ0 SI-short Sennheiser training data +#;; Data is read WSJ sentences, Sennheiser mic. +#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts +#;; per speaker TI) = 7236 utts +#;; +#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1 +#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1 + +# and as command-line argument it takes the names of the WSJ disk locations, e.g.: +# /group/corpora/public/wsjcam0/data on DICE machines. +# It outputs a list of absolute pathnames. + +$wsj_dir = $ARGV[0]; + +while(){ + if(m/^;/){ next; } # Comment. Ignore it. + else { + m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_"; + $filename = $2; # as a subdirectory of the distributed disk. 
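+    # some index entries omit the .wv1 extension, so append it before checking that the file exists on disk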
+    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
+    $filename = "$wsj_dir/$filename";
+    if (-e $filename) {
+      print "$filename\n";
+    } else {
+      print STDERR "File $filename found in the index but not on disk\n";
+    }
+  }
+}
diff --git a/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl b/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl
new file mode 100755
index 00000000000..fdeb38d9444
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/find_noisy_transcripts.pl
@@ -0,0 +1,65 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program takes on its standard input a list of utterance
+# id's, one per line (e.g. 4k0c030a is an utterance id).
+# It takes as its single argument a list of .dot files, and extracts
+# from those dot files the transcripts for the given dataset
+# (represented by the file list).
+#
+
+@ARGV == 1 || die "find_noisy_transcripts.pl dot_files_flist < utterance_ids > transcripts";
+$dot_flist = shift @ARGV;
+
+open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
+while(<L>){
+  chop;
+  m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
+  $spk = $1;
+  $spk2dot{$spk} = $_;
+}
+
+
+
+while(<STDIN>){
+  chop;
+  $uttid_orig = $_;
+  $uttid = substr $uttid_orig, 0, 8;
+  $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
+  $spk = $1;
+  if($spk ne $curspk) {
+    %utt2trans = ( ); # Don't keep all the transcripts in memory...
+    $curspk = $spk;
+    $dotfile = $spk2dot{$spk};
+    defined $dotfile || die "No dot file for speaker $spk\n";
+    open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
+    while(<F>) {
+      $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
+      $trans = $1;
+      $utt = $2;
+      $utt2trans{$utt} = $trans;
+    }
+  }
+  if(!defined $utt2trans{$uttid}) {
+    print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
+  } else {
+    print "$uttid_orig $utt2trans{$uttid}\n";
+  }
+}
+
+
diff --git a/egs/chime4/s5_1ch/local/find_transcripts.pl b/egs/chime4/s5_1ch/local/find_transcripts.pl
new file mode 100755
index 00000000000..6429411b864
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/find_transcripts.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+
+# This program takes on its standard input a list of utterance
+# id's, one per line (e.g. 4k0c030a is an utterance id).
+# It takes as its single argument a list of .dot files, and extracts
+# from those dot files the transcripts for the given dataset
+# (represented by the file list).
+#
+
+@ARGV == 1 || die "find_transcripts.pl dot_files_flist < utterance_ids > transcripts";
+$dot_flist = shift @ARGV;
+
+open(L, "<$dot_flist") || die "Opening file list of dot files: $dot_flist\n";
+while(<L>){
+  chop;
+  m:\S+/(\w{6})00.dot: || die "Bad line in dot file list: $_";
+  $spk = $1;
+  $spk2dot{$spk} = $_;
+}
+
+
+
+while(<STDIN>){
+  chop;
+  $uttid = $_;
+  $uttid =~ m:(\w{6})\w\w: || die "Bad utterance id $_";
+  $spk = $1;
+  if($spk ne $curspk) {
+    %utt2trans = ( ); # Don't keep all the transcripts in memory...
+    $curspk = $spk;
+    $dotfile = $spk2dot{$spk};
+    defined $dotfile || die "No dot file for speaker $spk\n";
+    open(F, "<$dotfile") || die "Error opening dot file $dotfile\n";
+    while(<F>) {
+      $_ =~ m:(.+)\((\w{8})\)\s*$: || die "Bad line $_ in dot file $dotfile (line $.)\n";
+      $trans = $1;
+      $utt = $2;
+      $utt2trans{$utt} = $trans;
+    }
+  }
+  if(!defined $utt2trans{$uttid}) {
+    print STDERR "No transcript for utterance $uttid (current dot file is $dotfile)\n";
+  } else {
+    print "$uttid $utt2trans{$uttid}\n";
+  }
+}
+
+
diff --git a/egs/timit/s4/utils/eps2disambig.pl b/egs/chime4/s5_1ch/local/flist2scp.pl
similarity index 57%
rename from egs/timit/s4/utils/eps2disambig.pl
rename to egs/chime4/s5_1ch/local/flist2scp.pl
index 049802b0888..234e4add1ed 100755
--- a/egs/timit/s4/utils/eps2disambig.pl
+++ b/egs/chime4/s5_1ch/local/flist2scp.pl
@@ -14,10 +14,18 @@
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 
-# This script replaces epsilon with #0 on the input side only, of the G.fst
-# acceptor.
+
+# takes in a file list with lines like
+# /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
+# and outputs an scp in kaldi format with lines like
+# 4k0c030a /mnt/matylda2/data/WSJ1/13-16.1/wsj1/si_dt_20/4k0/4k0c030a.wv1
+# (the first field is the utterance-id, which is the same as the basename of the file.)
+
 while(<>){
-    s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
-    print;
+    m:^\S+/(\w+)\.[wW][vV]1$: || die "Bad line $_";
+    $id = $1;
+    $id =~ tr/A-Z/a-z/; # Necessary because of weirdness on disk 13-16.1 (uppercase filenames)
+    print "$id $_";
 }
+
diff --git a/egs/chime4/s5_1ch/local/normalize_transcript.pl b/egs/chime4/s5_1ch/local/normalize_transcript.pl
new file mode 100755
index 00000000000..09cee06172e
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/normalize_transcript.pl
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This takes data from the standard input that's unnormalized transcripts in the format
+# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise]
+# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam]
+# and outputs normalized transcripts.
+# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc
+
+@ARGV == 1 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;
+
+while(<STDIN>) {
+  $_ =~ m:^(\S+) (.+): || die "bad line $_";
+  $utt = $1;
+  $trans = $2;
+  print "$utt";
+  foreach $w (split (" ",$trans)) {
+    $w =~ tr:a-z:A-Z:; # Upcase everything to match the CMU dictionary.
+    $w =~ s:\\::g;     # Remove backslashes.  We don't need the quoting.
+    $w =~ s:^\%PERCENT$:PERCENT:; # Normalization for Nov'93 test transcripts.
+    $w =~ s:^\.POINT$:POINT:;     # Normalization for Nov'93 test transcripts.
+    if($w =~ m:^\[\<\w+\]$: ||   # E.g. [<door_slam], this means a door slammed in the preceding word. Delete.
+       $w =~ m:^\[\w+\>\]$: ||   # E.g. [door_slam>], this means a door slammed in the next word. Delete.
+       $w =~ m:\[\w+/\]$: ||     # E.g. [phone_ring/], which indicates the start of this phenomenon.
+       $w =~ m:\[\/\w+]$: ||     # E.g. [/phone_ring], which indicates the end of this phenomenon.
+       $w eq "~" ||              # This is used to indicate truncation of an utterance.  Not a word.
+       $w eq ".") {              # "." is used to indicate a pause.  Silence is optional anyway so not much
+                                 # point including this in the transcript.
+      next; # we won't print this word.
+    } elsif($w =~ m:\[\w+\]:) {  # Other noises, e.g. [loud_breath].
+      print " $noise_word";
+    } elsif($w =~ m:^\<([\w\']+)\>$:) {
+      # e.g. replace <and> with and.  (the <> means verbal deletion of a word).. but it's pronounced.
+      print " $1";
+    } elsif($w eq "--DASH") {
+      print " -DASH"; # This is a common issue; the CMU dictionary has it as -DASH.
+#   } elsif($w =~ m:(.+)\-DASH$:) { # E.g. INCORPORATED-DASH... seems the DASH gets combined with previous word
+#     print " $1 -DASH";
+    } else {
+      print " $w";
+    }
+  }
+  print "\n";
+}
diff --git a/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh
new file mode 100755
index 00000000000..b5ff06f6903
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/real_enhan_chime4_data_prep.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -e
+
+# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This is modified from the script in standard Kaldi recipe to account
+# for the way the WSJ data is structured on the Edinburgh systems.
+# - Arnab Ghoshal, 29/05/12
+
+# Modified from the script for CHiME2 baseline
+# Shinji Watanabe 02/13/2015
+
+# Config:
+eval_flag=false # make it true when the evaluation data are released
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 2 ]; then
+  printf "\nUSAGE: %s <enhancement method> <enhanced speech directory>\n\n" `basename $0`
+  echo "The second argument should be the directory that only contains the enhanced speech data."
+  exit 1;
+fi
+
+echo "$0 $@"  # Print the command line for logging
+
+enhan=$1
+audio_dir=$2
+
+dir=`pwd`/data/local/data
+mkdir -p $dir
+local=`pwd`/local
+utils=`pwd`/utils
+odir=`pwd`/data
+
+. ./path.sh # Needed for KALDI_ROOT
+export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
+sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
+if [ ! -x $sph2pipe ]; then
+  echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
+  exit 1;
+fi
+
+if $eval_flag; then
+list_set="tr05_real_$enhan dt05_real_$enhan et05_real_$enhan"
+else
+list_set="tr05_real_$enhan dt05_real_$enhan"
+fi
+
+cd $dir
+
+find $audio_dir/ -name '*.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_$enhan.flist
+find $audio_dir/ -name '*.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_$enhan.flist
+if $eval_flag; then
+find $audio_dir/ -name '*.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_$enhan.flist
+fi
+
+# make a scp file from file list
+for x in $list_set; do
+  cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids
+  paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp
+done
+
+# make a transcription from the dot files
+cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> tr05_real_$enhan.ids
+cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_$enhan.txt
+paste -d" " tr05_real_$enhan.ids tr05_real_$enhan.txt | sort -k 1 > tr05_real_$enhan.trans1
+cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> dt05_real_$enhan.ids
+cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_$enhan.txt
+paste -d" " dt05_real_$enhan.ids dt05_real_$enhan.txt | sort -k 1 > dt05_real_$enhan.trans1
+if $eval_flag; then
+cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_REAL"}'> et05_real_$enhan.ids
+cat et05_real.dot | sed -e 's/(.*)//' > et05_real_$enhan.txt
+paste -d" " et05_real_$enhan.ids et05_real_$enhan.txt | sort -k 1 > et05_real_$enhan.trans1
+fi
+
+# Do some basic normalization steps.  At this point we don't remove OOVs--
+# that will be done inside the training scripts, as we'd like to make the
+# data-preparation stage independent of the specific lexicon used.
+noiseword="<NOISE>";
+for x in $list_set;do
+  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
+    | sort > $x.txt || exit 1;
+done
+
+# Make the utt2spk and spk2utt files.
+for x in $list_set; do
+  cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk
+  cat ${x}_wav.scp | awk '{print $1}' > $x.utt
+  paste -d" " $x.utt $x.spk > $x.utt2spk
+  cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1;
+done
+
+# copying data to data/...
+for x in $list_set; do
+  mkdir -p $odir/$x
+  cp ${x}_wav.scp $odir/$x/wav.scp || exit 1;
+  cp ${x}.txt $odir/$x/text || exit 1;
+  cp ${x}.spk2utt $odir/$x/spk2utt || exit 1;
+  cp ${x}.utt2spk $odir/$x/utt2spk || exit 1;
+done
+
+echo "Data preparation succeeded"
diff --git a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh
new file mode 100755
index 00000000000..86186b9e543
--- /dev/null
+++ b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+set -e
+
+# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
+# Apache 2.0.
+
+# This is modified from the script in standard Kaldi recipe to account
+# for the way the WSJ data is structured on the Edinburgh systems.
+# - Arnab Ghoshal, 29/05/12
+
+# Modified from the script for CHiME2 baseline
+# Shinji Watanabe 02/13/2015
+
+# Config:
+eval_flag=false # make it true when the evaluation data are released
+
+. utils/parse_options.sh || exit 1;
+
+if [ $# -ne 1 ]; then
+  printf "\nUSAGE: %s <CHiME4 root directory>\n\n" `basename $0`
+  echo "The argument should be the top-level CHiME4 directory."
+ echo "It is assumed that there will be a 'data' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +audio_dir=$1/data/audio/16kHz/isolated +trans_dir=$1/data/transcriptions + +echo "extract 5th channel (CH5.wav, the center bottom edge in the front of the tablet) for noisy data" + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_real_noisy dt05_real_noisy et05_real_noisy" +else +list_set="tr05_real_noisy dt05_real_noisy" +fi + +cd $dir + +find $audio_dir -name '*CH5.wav' | grep 'tr05_bus_real\|tr05_caf_real\|tr05_ped_real\|tr05_str_real' | sort -u > tr05_real_noisy.flist +find $audio_dir -name '*CH5.wav' | grep 'dt05_bus_real\|dt05_caf_real\|dt05_ped_real\|dt05_str_real' | sort -u > dt05_real_noisy.flist +if $eval_flag; then +find $audio_dir -name '*CH5.wav' | grep 'et05_bus_real\|et05_caf_real\|et05_ped_real\|et05_str_real' | sort -u > et05_real_noisy.flist +fi + +# make a dot format from json annotation files +cp $trans_dir/tr05_real.dot_all tr05_real.dot +cp $trans_dir/dt05_real.dot_all dt05_real.dot +if $eval_flag; then +cp $trans_dir/et05_real.dot_all et05_real.dot +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +#make a transcription from dot +cat tr05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> tr05_real_noisy.ids +cat tr05_real.dot | sed -e 's/(.*)//' > tr05_real_noisy.txt +paste -d" " tr05_real_noisy.ids tr05_real_noisy.txt | sort -k 1 > tr05_real_noisy.trans1 +cat dt05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> dt05_real_noisy.ids +cat dt05_real.dot | sed -e 's/(.*)//' > dt05_real_noisy.txt +paste -d" " dt05_real_noisy.ids dt05_real_noisy.txt | sort -k 1 > dt05_real_noisy.trans1 +if $eval_flag; then +cat et05_real.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_REAL"}'> et05_real_noisy.ids +cat et05_real.dot | sed -e 's/(.*)//' > et05_real_noisy.txt +paste -d" " et05_real_noisy.ids et05_real_noisy.txt | sort -k 1 > et05_real_noisy.trans1 +fi + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... 
+for x in $list_set; do + mkdir -p ../../$x + cp ${x}_wav.scp ../../$x/wav.scp || exit 1; + cp ${x}.txt ../../$x/text || exit 1; + cp ${x}.spk2utt ../../$x/spk2utt || exit 1; + cp ${x}.utt2spk ../../$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh b/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh new file mode 100755 index 00000000000..29d7ee0ff5e --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_beamform_2ch_track.sh @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamform_2ch_track.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 + +wdir=data/beamforming_2ch_track + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +allwavs=`find $sdir/ | grep "\.wav" | tr ' ' '\n' | awk -F '/' '{print $(NF-1)"/"$NF}'` + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$wdir/wavfiles.list +echo $allwavs | tr ' ' '\n' | awk -F '.' '{print $1}' | sort | uniq > $output_wavfiles + +# channel list +input_arrays=$wdir/channels +echo $allwavs | tr ' ' '\n' | sort | awk 'NR%2==1' > $wdir/channels.1st +echo $allwavs | tr ' ' '\n' | sort | awk 'NR%2==0' > $wdir/channels.2nd +paste -d" " $output_wavfiles $wdir/channels.1st $wdir/channels.2nd > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/chime4.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh b/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh new file mode 100755 index 00000000000..92e7b95707f --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_beamform_6ch_track.sh @@ -0,0 +1,100 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl +bmf="1 3 4 5 6" +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/run_beamform_6ch_track.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 3 4 5 6\" # microphones used for beamforming (2th mic is omitted in default)" + exit 1; +fi + +sdir=$1 +odir=$2 +wdir=data/beamforming_`echo $bmf | tr ' ' '_'` + +if [ -z $BEAMFORMIT ] ; then + export BEAMFORMIT=$KALDI_ROOT/tools/BeamformIt +fi +export PATH=${PATH}:$BEAMFORMIT +! hash BeamformIt && echo "Missing BeamformIt, run 'cd ../../../tools/; make beamformit;'" && exit 1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $wdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +# we only process dev and eval waves +output_wavfiles=$wdir/wavfiles.list +if $eval_flag; then + find $sdir/{dt,et}*{simu,real}/ | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles +else + find $sdir/dt*{simu,real}/ | grep CH1.wav \ + | awk -F '/' '{print $(NF-1) "/" $NF}' | sed -e "s/\.CH1\.wav//" | sort > $output_wavfiles +fi + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$wdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $wdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/chime4.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done +# making a subdirectory for the output wav files +for x in `awk -F '/' '{print $1}' $output_wavfiles | sort | uniq`; do + mkdir -p $odir/$x +done + +chmod a+x $wdir/log/beamform.*.sh +$cmd JOB=1:$nj $wdir/log/beamform.JOB.log \ + $wdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn.sh b/egs/chime4/s5_1ch/local/run_dnn.sh new file mode 100755 index 00000000000..db6437258fc --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_dnn.sh @@ -0,0 +1,237 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set enhanced data +enhan=$1 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# check whether run_init is executed +if [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +# get alignments +if [ $stage -le 0 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali + steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ + data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali_dt05 +fi + +# make fmllr feature for training multi = simu + real +gmmdir=exp/tri3b_tr05_multi_${train}_ali +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/${train} +if [ $stage -le 1 ]; then + for x in tr05_real_${train} tr05_simu_${train}; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${train} +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 2 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 3 ]; then + for data_dir in $data_fmllr/tr05_real_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/dt05_real_$enhan $data_fmllr/dt05_simu_$enhan; do + utils/data/get_utt2dur.sh $data_dir + done + + utils/combine_data.sh $data_fmllr/tr05_multi_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/tr05_real_${train} + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + if $eval_flag; then + for data_dir in $data_fmllr/et05_real_$enhan $data_fmllr/et05_simu_$enhan; do + utils/data/get_utt2dur.sh $data_dir + done + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan + fi +fi + +# pre-train dnn +dir=exp/tri4a_dnn_pretrain_tr05_multi_${train} +if [ $stage -le 4 ]; then + $cuda_cmd $dir/_pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_${train} $dir +fi + +# train dnn +dir=exp/tri4a_dnn_tr05_multi_${train} +ali=exp/tri3b_tr05_multi_${train}_ali +ali_dev=exp/tri3b_tr05_multi_${train}_ali_dt05 +feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_${train}/final.feature_transform +dbn=exp/tri4a_dnn_pretrain_tr05_multi_${train}/7.dbn +if [ $stage -le 5 ]; then + $cuda_cmd $dir/_train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn 
--hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/tr05_multi_${train} $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir +fi + +# decode enhanced speech +if [ $stage -le 6 ]; then + utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ + $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# Sequence training using sMBR criterion, we do Stochastic-GD +# with per-utterance updates. We use usually good acwt 0.1 +# Lattices are re-generated after 1st epoch, to get faster convergence. +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr +srcdir=exp/tri4a_dnn_tr05_multi_${train} +acwt=0.1 + +# First we generate lattices and alignments: +# gawk must be installed to perform awk -v FS="/" '{ print gensub(".gz","","",$NF)" gunzip -c "$0" |"; }' in +# steps/nnet/make_denlats.sh +if [ $stage -le 7 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 1 iteration of sMBR +if [ $stage -le 8 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir +fi + +# Decode (reuse HCLG graph) +if [ $stage -le 9 ]; then + for ITER in 1; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + done +fi + +# Re-generate lattices, run 4 more sMBR iterations +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr +acwt=0.1 + +# Generate lattices 
and alignments: +if [ $stage -le 10 ]; then + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats +fi + +# Re-train the DNN by 4 iterations of sMBR +if [ $stage -le 11 ]; then + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 +fi + +# Decode (reuse HCLG graph) +if [ $stage -le 12 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + done + wait +fi + +# scoring +if [ $stage -le 13 ]; then + # decoded results of enhanced speech using DNN AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri4a_dnn_tr05_multi_${train} $enhan exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result + # decoded results of enhanced speech using sequence-training DNN + ./local/chime4_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result + head -n 15 exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn_recog.sh b/egs/chime4/s5_1ch/local/run_dnn_recog.sh new file mode 100755 index 00000000000..5c9c1010fb2 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_dnn_recog.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. 
utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies acoustic and language model directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +# set model directory +mdir=$2 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check data/loca/data +if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ +fi + +# check gmm model +if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ +fi + +# check dnn graph +if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k" + mkdir -p exp/tri4a_dnn_tr05_multi_${train} + cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k exp/tri4a_dnn_tr05_multi_${train}/ +fi + +# check dnn smbr model +if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then + echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats" + mkdir -p exp + cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats exp/ +fi + +# make fmllr feature for dev and eval +gmmdir=exp/tri3b_tr05_multi_${train} +data_fmllr=data-fmllr-tri3b +mkdir -p $data_fmllr +fmllrdir=fmllr-tri3b/$enhan +if [ $stage -le 4 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ + --transform-dir $gmmdir/decode_tgpr_5k_$x \ + $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 5 ]; then + utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan + fi +fi + +# Re-generate lattices, run 4 more sMBR iterations +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +acwt=0.1 + +# Decode (reuse HCLG graph) +if [ $stage -le 6 ]; then + for ITER in 1 2 3 4; do + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k 
$data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & + if $eval_flag; then + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & + steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & + fi + wait + done +fi + +# scoring +if [ $stage -le 7 ]; then + # decoded results of enhanced speech using sequence-training DNN + ./local/chime4_calc_wers_smbr.sh $dir ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k > $dir/best_wer_${enhan}.result + head -n 15 $dir/best_wer_${enhan}.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm.sh b/egs/chime4/s5_1ch/local/run_gmm.sh new file mode 100755 index 00000000000..bedd6de51a5 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_gmm.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy # noisy data multi-condition training +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies the directory of enhanced wav files" + echo "Third argument specifies the CHiME4 root directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +enhan_data=$2 +# set chime4 data +chime4_data=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check whether run_init is executed +if [ ! -d data/lang ]; then + echo "error, execute local/run_init.sh, first" + exit 1; +fi + +####################### +#### training ######### +if [ $stage -le 1 ]; then + # process for distant talking speech for real and simulation data + local/real_noisy_chime4_data_prep.sh $chime4_data + local/simu_noisy_chime4_data_prep.sh $chime4_data +fi + +# Now make MFCC features for clean, close, and noisy data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. 
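+# e.g. mfccdir=/export/data/mfcc   # example path only; point this at whatever large disk you have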
+mfccdir=mfcc +if [ $stage -le 2 ]; then + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train}" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train}" + fi + for x in $tasks; do + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +# make mixed training set from real and simulation training data +# multi = simu + real +if [ $stage -le 3 ]; then + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} + utils/combine_data.sh data/dt05_multi_${train} data/dt05_simu_${train} data/dt05_real_${train} + if $eval_flag; then + utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} + fi +fi + +# training models for noisy data +if [ $stage -le 4 ]; then + nspk=`wc -l data/tr05_multi_${train}/spk2utt | awk '{print $1}'` + if [ $nj -gt $nspk ]; then + nj2=$nspk + else + nj2=$nj + fi + # training monophone model + steps/train_mono.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/mono0a_tr05_multi_${train} + steps/align_si.sh --boost-silence 1.25 --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/mono0a_tr05_multi_${train} exp/mono0a_ali_tr05_multi_${train} + + # training triphone model with lad mllt features + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/tr05_multi_${train} data/lang exp/mono0a_ali_tr05_multi_${train} exp/tri1_tr05_multi_${train} + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + data/tr05_multi_${train} data/lang exp/tri1_tr05_multi_${train} exp/tri1_ali_tr05_multi_${train} + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/tr05_multi_${train} data/lang exp/tri1_ali_tr05_multi_${train} exp/tri2b_tr05_multi_${train} + steps/align_si.sh --nj $nj2 --cmd "$train_cmd" \ + --use-graphs true data/tr05_multi_${train} data/lang exp/tri2b_tr05_multi_${train} exp/tri2b_ali_tr05_multi_${train} + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 15000 data/tr05_multi_${train} data/lang exp/tri2b_ali_tr05_multi_${train} exp/tri3b_tr05_multi_${train} + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}/graph_tgpr_5k +fi +#### training done #### +####################### + + +##################### +#### tsting ######### +# process for enhanced data +if [ $stage -le 5 ]; then + if [ ! -d data/dt05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi +fi + +# Now make MFCC features for enhanced data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc/$enhan +if [ $stage -le 6 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + if [ ! -e data/$x/feats.scp ]; then + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + fi + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 7 ]; then + if [ ! 
-d data/dt05_multi_$enhan ]; then + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan + fi + fi +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 8 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# scoring +if [ $stage -le 9 ]; then + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result +fi +#### tsting done #### +##################### + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm_recog.sh b/egs/chime4/s5_1ch/local/run_gmm_recog.sh new file mode 100755 index 00000000000..8824aa255f4 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_gmm_recog.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# Config: +nj=30 +stage=0 # resume training with --stage=N +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 3 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies the directory of enhanced wav files" + echo "Third argument specifies acoustic and language model directory" + exit 1; +fi + +# set enhanced data +enhan=$1 +enhan_data=$2 +# set model directory +mdir=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# check data/loca/data +if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ +fi + +# check gmm model +if [ ! 
-d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; +elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ +fi + +# process for enhanced data +if [ $stage -le 0 ]; then + if [ ! -d data/dt05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi +fi + +# Now make MFCC features for enhanced data +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc/$enhan +if [ $stage -le 1 ]; then + if $eval_flag; then + tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" + else + tasks="dt05_real_$enhan dt05_simu_$enhan" + fi + for x in $tasks; do + if [ ! -e data/$x/feats.scp ]; then + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + fi + done +fi + +# make mixed training set from real and simulation enhanced data +# multi = simu + real +if [ $stage -le 2 ]; then + if [ ! -d data/dt05_multi_$enhan ]; then + utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan + if $eval_flag; then + utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan + fi + fi +fi + +# decode enhanced speech using AMs trained with enhanced data +if [ $stage -le 3 ]; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & + if $eval_flag; then + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & + steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ + exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & + fi + wait; +fi + +# scoring +if [ $stage -le 4 ]; then + # decoded results of enhanced speech using AMs trained with enhanced data + local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ + > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result + head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_init.sh b/egs/chime4/s5_1ch/local/run_init.sh new file mode 100755 index 00000000000..3cafd7fbada --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_init.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Config: +nj=30 +stage=0 # resume training with --stage=N +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 +# made by Chao Weng + +. ./path.sh +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "Please specifies a CHiME4 root directory" + echo "If you use scripts distributed in the CHiME4 package," + echo "It would be `pwd`/../.." + exit 1; +fi + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +# clean data +chime4_data=$1 +wsj0_data=$chime4_data/data/WSJ0 # directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [ $stage -le 0 ]; then + # process for clean speech and making LMs etc. from original WSJ0 + # note that training on clean data means original WSJ0 data only (no booth data) + local/clean_wsj0_data_prep.sh $wsj0_data + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang + local/clean_chime4_format_data.sh +fi + +echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_lmrescore.sh b/egs/chime4/s5_1ch/local/run_lmrescore.sh new file mode 100755 index 00000000000..9ae66bdc3d6 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_lmrescore.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a root directory of Chime4 data" + echo "Second argument specifies a unique name for different enhancement method" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# data root +chime4_data=$1 +# enhan data +enhan=$2 + +# check data +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi + +# check whether run_dnn is executed +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats +if [ ! -d $srcdir ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; +fi + +# train a high-order n-gram language model +if [ $stage -le 1 ]; then + local/chime4_train_lms.sh $chime4_data || exit 1; +fi + +# train a RNN language model +if [ $stage -le 2 ]; then + local/chime4_train_rnnlms.sh $chime4_data || exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . 
+ popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! -f $srcdir/log/best_wer_$enhan ]; then + echo "error, execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/lmrescore.sh --mode 3 \ + data/lang_test_tgpr_5k \ + data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + data/lang_test_${lm_suffix} \ + data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh new file mode 100755 index 00000000000..c7d62530d19 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) + +nj=12 +stage=1 +order=5 +hidden=300 +rnnweight=0.5 +nbest=100 +train=noisy +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "First argument specifies a unique name for different enhancement method" + echo "Second argument specifies acoustic and language model directory" + exit 1; +fi + +# set language models +lm_suffix=${order}gkn_5k +rnnlm_suffix=rnnlm_5k_h${hidden} + +# enhan data +enhan=$1 +# set model directory +mdir=$2 +srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats + +# check language models +if [ ! 
-d $mdir/data/lang ]; then + echo "error, set $mdir correctly" + exit 1; +fi + +# preparation +dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore +mkdir -p $dir +# make a symbolic link to graph info +if [ ! -e $dir/graph_tgpr_5k ]; then + if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then + echo "graph is missing, execute local/run_dnn.sh, correctly" + exit 1; + fi + pushd . ; cd $dir + ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . + popd +fi + +# rescore lattices by a high-order N-gram +if [ $stage -le 3 ]; then + # check the best iteration + if [ ! -f $srcdir/log/best_wer_$enhan ]; then + echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_dnn.sh, first" + exit 1; + fi + it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` + # rescore lattices + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/lmrescore.sh --mode 3 \ + $mdir/data/lang_test_tgpr_5k \ + $mdir/data/lang_test_${lm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} + done + # rescored results by high-order n-gram LM + mkdir -p $dir/log + local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${lm_suffix}.result + head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result +fi + +# N-best rescoring using a RNNLM +if [ $stage -le 4 ]; then + # check the best lmw + if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then + echo "error, rescoring with a high-order n-gram seems to be failed" + exit 1; + fi + lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` + # rescore n-best list for all sets + if $eval_flag; then + tasks="dt05_simu dt05_real et05_simu et05_real" + else + tasks="dt05_simu dt05_real" + fi + for t in $tasks; do + steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ + $rnnweight \ + $mdir/data/lang_test_${lm_suffix} \ + $mdir/data/lang_test_${rnnlm_suffix} \ + data-fmllr-tri3b/${t}_$enhan \ + $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ + $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} + done + # calc wers for RNNLM results + local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ + > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result +fi diff --git a/egs/chime4/s5_1ch/local/score.sh b/egs/chime4/s5_1ch/local/score.sh new file mode 100755 index 00000000000..b18f350416d --- /dev/null +++ b/egs/chime4/s5_1ch/local/score.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +reverse=false +word_ins_penalty=0.0 +min_lmwt=5 +max_lmwt=20 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 
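+ echo " --word_ins_penalty # word insertion penalty applied to the lattices before scoring (default 0.0)."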
+ echo " --min_lmwt # minimum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + echo " --reverse (true/false) # score with time reversed features " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; + +if $reverse; then + for lmwt in `seq $min_lmwt $max_lmwt`; do + mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig + awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \ + <$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra + done +fi + +# Note: the double level of quoting for the sed command +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +exit 0; diff --git a/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh new file mode 100755 index 00000000000..c9e4dc96cc6 --- /dev/null +++ b/egs/chime4/s5_1ch/local/simu_enhan_chime4_data_prep.sh @@ -0,0 +1,112 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 2 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The second argument should be the directory that only contains enhanced speech data." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +enhan=$1 +audio_dir=$2 + +dir=`pwd`/data/local/data +mkdir -p $dir +local=`pwd`/local +utils=`pwd`/utils +odir=`pwd`/data + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ !
-x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_simu_$enhan dt05_simu_$enhan et05_simu_$enhan" +else +list_set="tr05_simu_$enhan dt05_simu_$enhan" +fi + +cd $dir + +find $audio_dir/ -name '*.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_$enhan.flist +find $audio_dir/ -name '*.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_$enhan.flist +if $eval_flag; then +find $audio_dir/ -name '*.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_$enhan.flist +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +# make a transcription from dot +# simulation training data extract dot file from original WSJ0 data +# since it is generated from these data +if [ ! -e dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; + exit 1; +fi +cat tr05_simu_${enhan}_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ + | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_$enhan.txt +cat tr05_simu_${enhan}_wav.scp | cut -f 1 -d" " > tr05_simu_$enhan.ids +paste -d" " tr05_simu_$enhan.ids tr05_simu_$enhan.txt | sort -k 1 > tr05_simu_$enhan.trans1 +# dt05 and et05 simulation data are generated from the CHiME4 booth recording +# and we use CHiME4 dot files +cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> dt05_simu_$enhan.ids +cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_$enhan.txt +paste -d" " dt05_simu_$enhan.ids dt05_simu_$enhan.txt | sort -k 1 > dt05_simu_$enhan.trans1 +if $eval_flag; then +cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF "_SIMU"}'> et05_simu_$enhan.ids +cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_$enhan.txt +paste -d" " et05_simu_$enhan.ids et05_simu_$enhan.txt | sort -k 1 > et05_simu_$enhan.trans1 +fi + +# Do some basic normalization steps. At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... +for x in $list_set; do + mkdir -p $odir/$x + cp ${x}_wav.scp $odir/$x/wav.scp || exit 1; + cp ${x}.txt $odir/$x/text || exit 1; + cp ${x}.spk2utt $odir/$x/spk2utt || exit 1; + cp ${x}.utt2spk $odir/$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh new file mode 100755 index 00000000000..6e7a827358e --- /dev/null +++ b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh @@ -0,0 +1,122 @@ +#!/bin/bash +set -e + +# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. 
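+# This script prepares the Kaldi data directories (wav.scp, text, utt2spk, spk2utt) for the simulated noisy CHiME4 sets (tr05/dt05, plus et05 when eval_flag=true), using channel 5 (CH5) of the isolated recordings.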
+ +# This is modified from the script in standard Kaldi recipe to account +# for the way the WSJ data is structured on the Edinburgh systems. +# - Arnab Ghoshal, 29/05/12 + +# Modified from the script for CHiME2 baseline +# Shinji Watanabe 02/13/2015 + +# Config: +eval_flag=false # make it true when the evaluation data are released + +. utils/parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + printf "\nUSAGE: %s \n\n" `basename $0` + echo "The argument should be a the top-level Chime4 directory." + echo "It is assumed that there will be a 'data' subdirectory" + echo "within the top-level corpus directory." + exit 1; +fi + +echo "$0 $@" # Print the command line for logging + +audio_dir=$1/data/audio/16kHz/isolated +trans_dir=$1/data/transcriptions + +echo "extract 5th channel (CH5.wav, the center bottom edge in the front of the tablet) for noisy data" + +dir=`pwd`/data/local/data +lmdir=`pwd`/data/local/nist_lm +mkdir -p $dir $lmdir +local=`pwd`/local +utils=`pwd`/utils + +. ./path.sh # Needed for KALDI_ROOT +export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +if [ ! -x $sph2pipe ]; then + echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; + exit 1; +fi + +if $eval_flag; then +list_set="tr05_simu_noisy dt05_simu_noisy et05_simu_noisy" +else +list_set="tr05_simu_noisy dt05_simu_noisy" +fi + +cd $dir + +find $audio_dir -name '*CH5.wav' | grep 'tr05_bus_simu\|tr05_caf_simu\|tr05_ped_simu\|tr05_str_simu' | sort -u > tr05_simu_noisy.flist +find $audio_dir -name '*CH5.wav' | grep 'dt05_bus_simu\|dt05_caf_simu\|dt05_ped_simu\|dt05_str_simu' | sort -u > dt05_simu_noisy.flist +if $eval_flag; then +find $audio_dir -name '*CH5.wav' | grep 'et05_bus_simu\|et05_caf_simu\|et05_ped_simu\|et05_str_simu' | sort -u > et05_simu_noisy.flist +fi + +# make a dot format from json annotation files +cp $trans_dir/dt05_simu.dot_all dt05_simu.dot +if $eval_flag; then +cp $trans_dir/et05_simu.dot_all et05_simu.dot +fi + +# make a scp file from file list +for x in $list_set; do + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp +done + +# make a transcription from dot +# simulation training data extract dot file from original WSJ0 data +# since it is generated from these data +if [ ! -e dot_files.flist ]; then + echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; + exit 1; +fi +cat tr05_simu_noisy_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ + | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt +cat tr05_simu_noisy_wav.scp | cut -f 1 -d" " > tr05_simu_noisy.ids +paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -k 1 > tr05_simu_noisy.trans1 +# dt05 and et05 simulation data are generated from the CHiME4 booth recording +# and we use CHiME4 dot files +cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_SIMU"}'> dt05_simu_noisy.ids +cat dt05_simu.dot | sed -e 's/(.*)//' > dt05_simu_noisy.txt +paste -d" " dt05_simu_noisy.ids dt05_simu_noisy.txt | sort -k 1 > dt05_simu_noisy.trans1 +if $eval_flag; then +cat et05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH5_SIMU"}'> et05_simu_noisy.ids +cat et05_simu.dot | sed -e 's/(.*)//' > et05_simu_noisy.txt +paste -d" " et05_simu_noisy.ids et05_simu_noisy.txt | sort -k 1 > et05_simu_noisy.trans1 +fi + +# Do some basic normalization steps. 
At this point we don't remove OOVs-- +# that will be done inside the training scripts, as we'd like to make the +# data-preparation stage independent of the specific lexicon used. +noiseword=""; +for x in $list_set;do + cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ + | sort > $x.txt || exit 1; +done + +# Make the utt2spk and spk2utt files. +for x in $list_set; do + cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + cat ${x}_wav.scp | awk '{print $1}' > $x.utt + paste -d" " $x.utt $x.spk > $x.utt2spk + cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; +done + +# copying data to data/... +for x in $list_set; do + mkdir -p ../../$x + cp ${x}_wav.scp ../../$x/wav.scp || exit 1; + cp ${x}.txt ../../$x/text || exit 1; + cp ${x}.spk2utt ../../$x/spk2utt || exit 1; + cp ${x}.utt2spk ../../$x/utt2spk || exit 1; +done + +echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh b/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh new file mode 100755 index 00000000000..6ddebd60293 --- /dev/null +++ b/egs/chime4/s5_1ch/local/wsj_prepare_dict.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2010-2012 Microsoft Corporation +# 2012-2014 Johns Hopkins University (Author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + +# run this from ../ +dir=data/local/dict +mkdir -p $dir + + +# (1) Get the CMU dictionary +svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dir/cmudict || exit 1; + +# can add -r 10966 for strict compatibility. + + +#(2) Dictionary preparation: + + +# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). +# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. + +# silence phones, one per line. +(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt +echo SIL > $dir/optional_silence.txt + +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + | sort > $dir/nonsilence_phones.txt || exit 1; + +# A few extra questions that will be added to those obtained by automatically clustering +# the "real" phones. These ask about stress; there's also one for silence. 
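+# (for example: the first line written below is the silence question "SIL SPN NSN", and each following line groups all CMUdict phones that share the same stress digit, e.g. all "...1" phones)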
+cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add to cmudict the silences, noises etc. + +# the sort | uniq is to remove a duplicated pron from cmudict. +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ + cat - $dir/lexicon1_raw_nosil.txt | sort | uniq > $dir/lexicon2_raw.txt || exit 1; + + +# lexicon.txt is without the _B, _E, _S, _I markers. +# This is the input to wsj_format_data.sh +cp $dir/lexicon2_raw.txt $dir/lexicon.txt + +rm $dir/lexiconp.txt 2>/dev/null + +echo "Dictionary preparation succeeded" + diff --git a/egs/chime4/s5_1ch/path.sh b/egs/chime4/s5_1ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_1ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_1ch/run.sh b/egs/chime4/s5_1ch/run.sh new file mode 100755 index 00000000000..012a7eefc81 --- /dev/null +++ b/egs/chime4/s5_1ch/run.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (1ch track: single channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# In this script, we use non-enhanced 6th microphone signals. 
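+# (no enhancement is applied in this 1ch baseline; the commented-out stage 1 below is where you can plug in your own single-channel enhancement)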
+enhancement_method=isolated_1ch_track +enhancement_data=$chime4_data/data/audio/16kHz/$enhancement_method +#if [ $stage -le 1 ]; then +# put your single channel enhancement +#fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. +# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_1ch/steps b/egs/chime4/s5_1ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_1ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_1ch/utils b/egs/chime4/s5_1ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_1ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime4/s5_2ch/RESULTS b/egs/chime4/s5_2ch/RESULTS new file mode 100644 index 00000000000..81c18cccf07 --- /dev/null +++ b/egs/chime4/s5_2ch/RESULTS @@ -0,0 +1,49 @@ +# CHiME-4 2ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. 
+# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition with beamformit +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_2mics.result +------------------- +best overall dt05 WER 17.69% (language model weight = 11) +------------------- +dt05_simu WER: 19.15% (Average), 16.14% (BUS), 23.55% (CAFE), 15.49% (PEDESTRIAN), 21.42% (STREET) +------------------- +dt05_real WER: 16.22% (Average), 20.12% (BUS), 16.25% (CAFE), 12.35% (PEDESTRIAN), 16.18% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_2mics.result +------------------- +best overall dt05 WER 11.63% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 12.36% (Average), 10.66% (BUS), 15.55% (CAFE), 9.87% (PEDESTRIAN), 13.36% (STREET) +------------------- +dt05_real WER: 10.90% (Average), 13.62% (BUS), 10.63% (CAFE), 7.69% (PEDESTRIAN), 11.65% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_2mics_5gkn_5k.result +------------------- +best overall dt05 WER 10.17% (language model weight = 11) +------------------- +dt05_simu WER: 10.72% (Average), 9.37% (BUS), 13.70% (CAFE), 8.07% (PEDESTRIAN), 11.73% (STREET) +------------------- +dt05_real WER: 9.63% (Average), 11.93% (BUS), 9.75% (CAFE), 6.46% (PEDESTRIAN), 10.37% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 8.86% (language model weight = 12) +------------------- +dt05_simu WER: 9.50% (Average), 8.19% (BUS), 12.15% (CAFE), 7.12% (PEDESTRIAN), 10.55% (STREET) +------------------- +dt05_real WER: 8.23% (Average), 10.90% (BUS), 7.96% (CAFE), 5.22% (PEDESTRIAN), 8.82% (STREET) +------------------- + + diff --git a/egs/chime4/s5_2ch/cmd.sh b/egs/chime4/s5_2ch/cmd.sh new file mode 100755 index 00000000000..2626a1a35b2 --- /dev/null +++ b/egs/chime4/s5_2ch/cmd.sh @@ -0,0 +1,21 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +#export train_cmd="queue.pl --mem 2G" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... 
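+# (illustration only, not part of the original recipe: on a SLURM cluster the analogous settings would be something like 'export train_cmd="slurm.pl --mem 2G"' and 'export decode_cmd="slurm.pl --mem 4G"')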
+export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_2ch/conf/chime4.cfg b/egs/chime4/s5_2ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_2ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file, originally for AMI data (http://groups.inf.ed.ac.uk/ami/download/), reused here for CHiME4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_2ch/conf/decode_dnn.config b/egs/chime4/s5_2ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_2ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_2ch/conf/fbank.conf b/egs/chime4/s5_2ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_2ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # use the standard Hamming window instead of Kaldi's default "povey" window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for the 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/timit/s4/conf/mfcc.conf b/egs/chime4/s5_2ch/conf/mfcc.conf similarity index 100% rename from egs/timit/s4/conf/mfcc.conf rename to egs/chime4/s5_2ch/conf/mfcc.conf diff --git a/egs/chime4/s5_2ch/local b/egs/chime4/s5_2ch/local new file mode 120000 index 00000000000..93f81ea6259 --- /dev/null +++ b/egs/chime4/s5_2ch/local @@ -0,0 +1 @@ +../s5_1ch/local \ No newline at end of file diff --git a/egs/chime4/s5_2ch/path.sh b/egs/chime4/s5_2ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_2ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +.
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_2ch/run.sh b/egs/chime4/s5_2ch/run.sh new file mode 100755 index 00000000000..16d92723fdf --- /dev/null +++ b/egs/chime4/s5_2ch/run.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (2ch track: 2 channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# Using Beamformit +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_2mics +enhancement_data=`pwd`/enhan/$enhancement_method +if [ $stage -le 1 ]; then + local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data +fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. +# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. 
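+# (if you only need the results up to the sMBR DNN stage, you can skip the rescoring by running the script with --stage 5)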
+if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_2ch/steps b/egs/chime4/s5_2ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_2ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_2ch/utils b/egs/chime4/s5_2ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_2ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime4/s5_6ch/RESULTS b/egs/chime4/s5_6ch/RESULTS new file mode 100644 index 00000000000..533edc2704e --- /dev/null +++ b/egs/chime4/s5_6ch/RESULTS @@ -0,0 +1,48 @@ +# CHiME-4 6ch track results +# The result is based on Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15, +# and please refer the paper if you think the baseline useful. +# Note that the following result is different from that in the paper since we don't include +# SRI's robust features and system combination + +GMM noisy multi-condition with beamformit +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 13.67% (language model weight = 11) +------------------- +dt05_simu WER: 14.30% (Average), 12.80% (BUS), 17.05% (CAFE), 11.90% (PEDESTRIAN), 15.46% (STREET) +------------------- +dt05_real WER: 13.03% (Average), 16.03% (BUS), 12.80% (CAFE), 10.02% (PEDESTRIAN), 13.27% (STREET) +------------------- + +DNN sMBR +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 8.60% (language model weight = 11) + (Number of iterations = 4) +------------------- +dt05_simu WER: 9.07% (Average), 8.44% (BUS), 10.63% (CAFE), 7.39% (PEDESTRIAN), 9.82% (STREET) +------------------- +dt05_real WER: 8.14% (Average), 10.22% (BUS), 8.19% (CAFE), 5.69% (PEDESTRIAN), 8.45% (STREET) +------------------- + +5-gram rescoring +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +------------------- +best overall dt05 WER 7.30% (language model weight = 11) +------------------- +dt05_simu WER: 7.75% (Average), 7.14% (BUS), 9.13% (CAFE), 6.33% (PEDESTRIAN), 8.41% (STREET) +------------------- +dt05_real WER: 6.85% (Average), 8.53% (BUS), 6.90% (CAFE), 4.72% (PEDESTRIAN), 7.24% (STREET) +------------------- + +RNNLM +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 6.27% (language model weight = 12) +------------------- +dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) +------------------- +dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) +------------------- + diff --git a/egs/chime4/s5_6ch/cmd.sh b/egs/chime4/s5_6ch/cmd.sh new file mode 100755 index 00000000000..2626a1a35b2 --- /dev/null +++ b/egs/chime4/s5_6ch/cmd.sh @@ -0,0 +1,21 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). 
slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +#export train_cmd="queue.pl --mem 2G" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 8G" + +# run it locally... +export train_cmd=run.pl +export decode_cmd=run.pl +export cuda_cmd=run.pl +export mkgraph_cmd=run.pl diff --git a/egs/chime4/s5_6ch/conf/chime4.cfg b/egs/chime4/s5_6ch/conf/chime4.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime4/s5_6ch/conf/chime4.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file, originally for AMI data (http://groups.inf.ed.ac.uk/ami/download/), reused here for CHiME4 + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process the whole file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime4/s5_6ch/conf/decode_dnn.config b/egs/chime4/s5_6ch/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/chime4/s5_6ch/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/chime4/s5_6ch/conf/fbank.conf b/egs/chime4/s5_6ch/conf/fbank.conf new file mode 100644 index 00000000000..5fc7774b31f --- /dev/null +++ b/egs/chime4/s5_6ch/conf/fbank.conf @@ -0,0 +1,11 @@ +# Filterbank feature configuration. +--window-type=hamming # use the standard Hamming window instead of Kaldi's default "povey" window +--use-energy=false # only fbank outputs +--sample-frequency=16000 # CHiME4 audio is sampled at 16kHz + +--low-freq=64 # typical setup from Frantisek Grezl +--high-freq=8000 +--dither=1 + +--num-mel-bins=40 # 40 mel bins for the 16kHz audio +--htk-compat=true # try to make it compatible with HTK diff --git a/egs/chime4/s5_6ch/conf/mfcc.conf b/egs/chime4/s5_6ch/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/chime4/s5_6ch/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option.
diff --git a/egs/chime4/s5_6ch/local b/egs/chime4/s5_6ch/local new file mode 120000 index 00000000000..93f81ea6259 --- /dev/null +++ b/egs/chime4/s5_6ch/local @@ -0,0 +1 @@ +../s5_1ch/local \ No newline at end of file diff --git a/egs/chime4/s5_6ch/path.sh b/egs/chime4/s5_6ch/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/chime4/s5_6ch/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/chime4/s5_6ch/run.sh b/egs/chime4/s5_6ch/run.sh new file mode 100755 index 00000000000..d5a8b871a07 --- /dev/null +++ b/egs/chime4/s5_6ch/run.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Kaldi ASR baseline for the CHiME-4 Challenge (6ch track: 6 channel track) +# +# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) +# Inria (Emmanuel Vincent) +# Mitsubishi Electric Research Labs (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. ./path.sh +. ./cmd.sh + +# Config: +stage=0 # resume training with --stage=N +flatstart=false + +. utils/parse_options.sh || exit 1; + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +#####check data and model paths################ +# Set a main root directory of the CHiME4 data +# If you use scripts distributed in the CHiME4 package, +chime4_data=`pwd`/../.. +# Otherwise, please specify it, e.g., +chime4_data=/db/laputa1/data/processed/public/CHiME4 +if [ ! -d $chime4_data ]; then + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 +fi +# Set a model directory for the CHiME4 data. +modeldir=$chime4_data/tools/ASR_models +for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ + $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do + [ ! -d ] && echo "$0: no such directory $d. specify models correctly or execute './run.sh --flatstart true' first" && exit 1; +done +#####check data and model paths finished####### + + +#####main program start################ +# You can execute run_init.sh only "once" +# This creates 3-gram LM, FSTs, and basic task files +if [ $stage -le 0 ] && $flatstart; then + local/run_init.sh $chime4_data +fi + +# Using Beamformit +# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, +# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 +# note that beamformed wav files are generated in the following directory +enhancement_method=beamformit_5mics +enhancement_data=`pwd`/enhan/$enhancement_method +if [ $stage -le 1 ]; then + local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data +fi + +# GMM based ASR experiment without "retraining" +# Please set a directory of your speech enhancement method. +# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
+# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 2 ]; then + if $flatstart; then + local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data + else + local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + fi +fi + +# DNN based ASR experiment +# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 3 ]; then + if $flatstart; then + local/run_dnn.sh $enhancement_method + else + local/run_dnn_recog.sh $enhancement_method $modeldir + fi +fi + +# LM-rescoring experiment with 5-gram and RNN LMs +# It takes a few days to train a RNNLM. +if [ $stage -le 4 ]; then + if $flatstart; then + local/run_lmrescore.sh $chime4_data $enhancement_method + else + local/run_lmrescore_recog.sh $enhancement_method $modeldir + fi +fi + +echo "Done." diff --git a/egs/chime4/s5_6ch/steps b/egs/chime4/s5_6ch/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/chime4/s5_6ch/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/chime4/s5_6ch/utils b/egs/chime4/s5_6ch/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/chime4/s5_6ch/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/csj/README.txt b/egs/csj/README.txt index 9683c8b543c..268a313b458 100644 --- a/egs/csj/README.txt +++ b/egs/csj/README.txt @@ -1,16 +1,27 @@ About the Corpus of Spontaneous Japanese: The Corpus of Spontaneous Japanese (CSJ) is a database of spoken -Japanese developed by the Japan's national priority area research +Japanese developed by the Japan's national priority area research project "Spontaneous Speech: Corpus and Processing Technology". -It contains about 650 hours of speech consisting of approximately +It contains about 650 hours of speech consisting of approximately 7.5 million words that were provided by more than 1,400 speakers. -For more details about the corpus, please visit the website of the +For more details about the corpus, please visit the website of the National Institute for Japanese Language (NINJAL). It is available from the Institute. http://www.ninjal.ac.jp/english/products/csj/ http://pj.ninjal.ac.jp/corpus_center/csj/ +Meta-parameter tuning based on evolution strategy: +The meta-parameters of the system contained in conf/config_opt were +automatically tuned using evolution strategy. For the details, +please refer the following paper: +Takafumi Moriya, Tomohiro Tanaka, Takahiro Shinozaki, Shinji Watanabe, +and Kevin Duh, "Automation of System Building for State-of-the-art +Large Vocabulary Speech Recognition Using Evolution Strategy," Proc. +IEEE 2015 Automatic Speech Recognition and Understanding Workshop +(ASRU), 2015. + + Each subdirectory of this directory contains the -scripts for a sequence of experiments. +scripts for a sequence of experiments. s5: This is the current recommended recipe. - The third edition of CSJ is assumed. + The recipe supports the third and fourth editions of CSJ. diff --git a/egs/csj/s5/RESULTS b/egs/csj/s5/RESULTS index 208d99b8d66..340879aeda7 100644 --- a/egs/csj/s5/RESULTS +++ b/egs/csj/s5/RESULTS @@ -1,117 +1,118 @@ +## These are results using the third edition of CSJ. 
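+## Note: the decode results below are named wer_<LM-weight>_<word-insertion-penalty>, i.e. scoring now also sweeps the word insertion penalty (0.0, 0.5, 1.0) in addition to the LM weight.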
for eval_num in `seq 3`; do echo "=== evaluation set $eval_num ===" ;\ for x in exp/{tri,dnn}*/decode_eval${eval_num}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done ; done ## Results of using training set that contains "academic" and "other" speech data (default). -## If you want to use "trial lecture" and "dialog" data, you should check the following script [local/csj_data_prep.sh line 44]. +## If you want to use "simulated public speaking" and "dialog" data, you should check the following script [local/csj_data_prep.sh line 44]. === evaluation set 1 === -%WER 22.67 [ 6269 / 27651, 522 ins, 1903 del, 3844 sub ] exp/tri1/decode_eval1_csj/wer_12 -%WER 21.49 [ 5943 / 27651, 541 ins, 1745 del, 3657 sub ] exp/tri2/decode_eval1_csj/wer_12 -%WER 17.49 [ 4837 / 27651, 613 ins, 1269 del, 2955 sub ] exp/tri3/decode_eval1_csj/wer_16 -%WER 15.26 [ 4220 / 27651, 566 ins, 1071 del, 2583 sub ] exp/tri4/decode_eval1_csj/wer_17 -%WER 17.33 [ 4792 / 27651, 628 ins, 1137 del, 3027 sub ] exp/tri4/decode_eval1_csj.si/wer_16 -%WER 14.59 [ 4033 / 27651, 617 ins, 919 del, 2497 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_14 -%WER 14.14 [ 3911 / 27651, 585 ins, 915 del, 2411 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_17 -%WER 14.00 [ 3871 / 27651, 586 ins, 888 del, 2397 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15 -%WER 13.92 [ 3850 / 27651, 661 ins, 793 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14 -%WER 14.15 [ 3913 / 27651, 640 ins, 877 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_14 -%WER 14.39 [ 3979 / 27651, 570 ins, 946 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_17 -%WER 14.09 [ 3895 / 27651, 576 ins, 882 del, 2437 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_15 -%WER 14.02 [ 3877 / 27651, 602 ins, 858 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_15 -%WER 14.00 [ 3870 / 27651, 609 ins, 853 del, 2408 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_15 -%WER 11.93 [ 3298 / 27651, 348 ins, 970 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12 -%WER 11.29 [ 3123 / 27651, 509 ins, 651 del, 1963 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_15 -%WER 10.87 [ 3007 / 27651, 497 ins, 589 del, 1921 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14 +%WER 22.67 [ 6269 / 27651, 522 ins, 1903 del, 3844 sub ] exp/tri1/decode_eval1_csj/wer_12_0.0 +%WER 21.49 [ 5943 / 27651, 541 ins, 1745 del, 3657 sub ] exp/tri2/decode_eval1_csj/wer_12_0.0 +%WER 17.49 [ 4837 / 27651, 613 ins, 1269 del, 2955 sub ] exp/tri3/decode_eval1_csj/wer_16_0.0 +%WER 15.26 [ 4220 / 27651, 566 ins, 1071 del, 2583 sub ] exp/tri4/decode_eval1_csj/wer_17_0.0 +%WER 17.33 [ 4792 / 27651, 628 ins, 1137 del, 3027 sub ] exp/tri4/decode_eval1_csj.si/wer_16_0.0 +%WER 14.59 [ 4033 / 27651, 617 ins, 919 del, 2497 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_14_0.0 +%WER 14.14 [ 3911 / 27651, 585 ins, 915 del, 2411 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_17_0.0 +%WER 14.00 [ 3871 / 27651, 586 ins, 888 del, 2397 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15_0.5 +%WER 13.92 [ 3850 / 27651, 661 ins, 793 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14_0.5 +%WER 14.15 [ 3913 / 27651, 640 ins, 877 del, 2396 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_14_0.5 +%WER 14.39 [ 3979 / 27651, 570 ins, 946 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_17_0.5 +%WER 14.09 [ 3895 / 27651, 576 ins, 882 del, 2437 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_15_0.0 
+%WER 14.02 [ 3877 / 27651, 602 ins, 858 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_15_0.5 +%WER 14.00 [ 3870 / 27651, 609 ins, 853 del, 2408 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_15_0.5 +%WER 11.93 [ 3298 / 27651, 348 ins, 970 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12_0.0 +%WER 11.29 [ 3123 / 27651, 509 ins, 651 del, 1963 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_15_1.0 +%WER 10.87 [ 3007 / 27651, 497 ins, 589 del, 1921 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14_0.5 === evaluation set 2 === -%WER 19.80 [ 5628 / 28424, 561 ins, 1511 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12 -%WER 19.04 [ 5413 / 28424, 600 ins, 1423 del, 3390 sub ] exp/tri2/decode_eval2_csj/wer_12 -%WER 15.80 [ 4490 / 28424, 582 ins, 1131 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_16 -%WER 13.95 [ 3964 / 28424, 691 ins, 843 del, 2430 sub ] exp/tri4/decode_eval2_csj/wer_13 -%WER 18.74 [ 5326 / 28424, 804 ins, 1056 del, 3466 sub ] exp/tri4/decode_eval2_csj.si/wer_17 -%WER 12.77 [ 3631 / 28424, 604 ins, 781 del, 2246 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_14 -%WER 12.27 [ 3488 / 28424, 604 ins, 707 del, 2177 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_14 -%WER 12.32 [ 3502 / 28424, 613 ins, 713 del, 2176 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_13 -%WER 12.32 [ 3502 / 28424, 658 ins, 688 del, 2156 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_14 -%WER 12.56 [ 3569 / 28424, 642 ins, 760 del, 2167 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_13 -%WER 12.51 [ 3557 / 28424, 588 ins, 766 del, 2203 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_15 -%WER 12.25 [ 3482 / 28424, 587 ins, 730 del, 2165 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_14 -%WER 12.20 [ 3467 / 28424, 599 ins, 706 del, 2162 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_14 -%WER 12.33 [ 3504 / 28424, 615 ins, 714 del, 2175 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_14 -%WER 10.24 [ 2910 / 28424, 271 ins, 852 del, 1787 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12 -%WER 9.41 [ 2676 / 28424, 453 ins, 432 del, 1791 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_14 -%WER 9.19 [ 2612 / 28424, 417 ins, 422 del, 1773 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_14 +%WER 19.80 [ 5628 / 28424, 561 ins, 1511 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12_0.0 +%WER 19.04 [ 5413 / 28424, 600 ins, 1423 del, 3390 sub ] exp/tri2/decode_eval2_csj/wer_12_0.0 +%WER 15.80 [ 4490 / 28424, 582 ins, 1131 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_16_0.5 +%WER 13.95 [ 3964 / 28424, 691 ins, 843 del, 2430 sub ] exp/tri4/decode_eval2_csj/wer_13_0.0 +%WER 18.74 [ 5326 / 28424, 804 ins, 1056 del, 3466 sub ] exp/tri4/decode_eval2_csj.si/wer_17_0.0 +%WER 12.77 [ 3631 / 28424, 604 ins, 781 del, 2246 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_14_0.5 +%WER 12.27 [ 3488 / 28424, 604 ins, 707 del, 2177 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_14_0.5 +%WER 12.32 [ 3502 / 28424, 613 ins, 713 del, 2176 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_13_0.5 +%WER 12.32 [ 3502 / 28424, 658 ins, 688 del, 2156 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_14_1.0 +%WER 12.56 [ 3569 / 28424, 642 ins, 760 del, 2167 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_13_1.0 +%WER 12.51 [ 3557 / 28424, 588 ins, 766 del, 2203 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_15_0.5 +%WER 12.25 [ 3482 / 28424, 587 ins, 730 del, 2165 sub ] 
exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_14_0.5 +%WER 12.20 [ 3467 / 28424, 599 ins, 706 del, 2162 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_14_0.5 +%WER 12.33 [ 3504 / 28424, 615 ins, 714 del, 2175 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_14_0.5 +%WER 10.24 [ 2910 / 28424, 271 ins, 852 del, 1787 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12_0.0 +%WER 9.41 [ 2676 / 28424, 453 ins, 432 del, 1791 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_14_1.0 +%WER 9.19 [ 2612 / 28424, 417 ins, 422 del, 1773 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_14_0.5 === evaluation set 3 === -%WER 24.80 [ 4534 / 18283, 447 ins, 1350 del, 2737 sub ] exp/tri1/decode_eval3_csj/wer_15 -%WER 23.68 [ 4329 / 18283, 497 ins, 1183 del, 2649 sub ] exp/tri2/decode_eval3_csj/wer_13 -%WER 19.97 [ 3651 / 18283, 582 ins, 828 del, 2241 sub ] exp/tri3/decode_eval3_csj/wer_17 -%WER 17.27 [ 3158 / 18283, 520 ins, 752 del, 1886 sub ] exp/tri4/decode_eval3_csj/wer_19 -%WER 21.44 [ 3919 / 18283, 660 ins, 823 del, 2436 sub ] exp/tri4/decode_eval3_csj.si/wer_20 -%WER 16.56 [ 3028 / 18283, 476 ins, 716 del, 1836 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_20 -%WER 15.79 [ 2887 / 18283, 547 ins, 554 del, 1786 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_15 -%WER 15.89 [ 2906 / 18283, 519 ins, 597 del, 1790 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_15 -%WER 15.64 [ 2860 / 18283, 556 ins, 512 del, 1792 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_15 -%WER 16.38 [ 2994 / 18283, 529 ins, 655 del, 1810 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_15 -%WER 16.13 [ 2949 / 18283, 505 ins, 630 del, 1814 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_18 -%WER 15.97 [ 2920 / 18283, 540 ins, 556 del, 1824 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14 -%WER 15.98 [ 2922 / 18283, 564 ins, 537 del, 1821 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_14 -%WER 15.98 [ 2921 / 18283, 548 ins, 566 del, 1807 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_15 -%WER 13.94 [ 2548 / 18283, 313 ins, 716 del, 1519 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13 -%WER 12.52 [ 2289 / 18283, 464 ins, 354 del, 1471 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_15 -%WER 12.18 [ 2226 / 18283, 431 ins, 340 del, 1455 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_15 +%WER 24.80 [ 4534 / 18283, 447 ins, 1350 del, 2737 sub ] exp/tri1/decode_eval3_csj/wer_15_0.0 +%WER 23.68 [ 4329 / 18283, 497 ins, 1183 del, 2649 sub ] exp/tri2/decode_eval3_csj/wer_13_0.0 +%WER 19.97 [ 3651 / 18283, 582 ins, 828 del, 2241 sub ] exp/tri3/decode_eval3_csj/wer_17_0.5 +%WER 17.27 [ 3158 / 18283, 520 ins, 752 del, 1886 sub ] exp/tri4/decode_eval3_csj/wer_19_0.0 +%WER 21.44 [ 3919 / 18283, 660 ins, 823 del, 2436 sub ] exp/tri4/decode_eval3_csj.si/wer_20_1.0 +%WER 16.56 [ 3028 / 18283, 476 ins, 716 del, 1836 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_20_0.0 +%WER 15.79 [ 2887 / 18283, 547 ins, 554 del, 1786 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_15_0.5 +%WER 15.89 [ 2906 / 18283, 519 ins, 597 del, 1790 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_15_0.5 +%WER 15.64 [ 2860 / 18283, 556 ins, 512 del, 1792 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_15_1.0 +%WER 16.38 [ 2994 / 18283, 529 ins, 655 del, 1810 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_15_0.5 +%WER 16.13 [ 2949 / 18283, 505 ins, 630 del, 1814 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_18_0.0 +%WER 15.97 [ 2920 / 18283, 540 ins, 556 del, 
1824 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14_0.5 +%WER 15.98 [ 2922 / 18283, 564 ins, 537 del, 1821 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_14_0.0 +%WER 15.98 [ 2921 / 18283, 548 ins, 566 del, 1807 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_15_1.0 +%WER 13.94 [ 2548 / 18283, 313 ins, 716 del, 1519 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13_0.0 +%WER 12.52 [ 2289 / 18283, 464 ins, 354 del, 1471 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_15_0.0 +%WER 12.18 [ 2226 / 18283, 431 ins, 340 del, 1455 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_15_0.5 -## Results of using training data that contain all types of speech data. +## Results of using training data that contain all types of speech data except for dialog type. === evaluation set 1 === -%WER 22.71 [ 6279 / 27651, 524 ins, 1936 del, 3819 sub ] exp/tri1/decode_eval1_csj/wer_13 -%WER 21.36 [ 5905 / 27651, 529 ins, 1781 del, 3595 sub ] exp/tri2/decode_eval1_csj/wer_13 -%WER 17.89 [ 4948 / 27651, 586 ins, 1314 del, 3048 sub ] exp/tri3/decode_eval1_csj/wer_16 -%WER 15.85 [ 4383 / 27651, 580 ins, 1169 del, 2634 sub ] exp/tri4/decode_eval1_csj/wer_17 -%WER 18.06 [ 4995 / 27651, 671 ins, 1209 del, 3115 sub ] exp/tri4/decode_eval1_csj.si/wer_15 -%WER 15.17 [ 4196 / 27651, 536 ins, 1105 del, 2555 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_17 -%WER 14.32 [ 3959 / 27651, 578 ins, 949 del, 2432 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_15 -%WER 14.20 [ 3926 / 27651, 598 ins, 885 del, 2443 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_13 -%WER 13.93 [ 3851 / 27651, 631 ins, 829 del, 2391 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_14 -%WER 14.09 [ 3895 / 27651, 621 ins, 847 del, 2427 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_12 -%WER 14.69 [ 4061 / 27651, 587 ins, 981 del, 2493 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_15 -%WER 14.48 [ 4003 / 27651, 549 ins, 1001 del, 2453 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_16 -%WER 14.33 [ 3963 / 27651, 611 ins, 901 del, 2451 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_14 -%WER 14.12 [ 3905 / 27651, 610 ins, 870 del, 2425 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_14 -%WER 11.62 [ 3214 / 27651, 381 ins, 799 del, 2034 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12 -%WER 10.93 [ 3021 / 27651, 475 ins, 566 del, 1980 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_14 -%WER 10.71 [ 2962 / 27651, 516 ins, 496 del, 1950 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_13 +%WER 22.97 [ 6352 / 27651, 514 ins, 1941 del, 3897 sub ] exp/tri1/decode_eval1_csj/wer_13_0.0 +%WER 21.48 [ 5939 / 27651, 482 ins, 1885 del, 3572 sub ] exp/tri2/decode_eval1_csj/wer_14_0.0 +%WER 17.86 [ 4939 / 27651, 596 ins, 1305 del, 3038 sub ] exp/tri3/decode_eval1_csj/wer_15_0.0 +%WER 15.67 [ 4333 / 27651, 584 ins, 1121 del, 2628 sub ] exp/tri4/decode_eval1_csj/wer_16_0.0 +%WER 17.88 [ 4943 / 27651, 623 ins, 1226 del, 3094 sub ] exp/tri4/decode_eval1_csj.si/wer_16_0.0 +%WER 15.01 [ 4150 / 27651, 580 ins, 1009 del, 2561 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it4_csj/wer_15_0.0 +%WER 14.28 [ 3949 / 27651, 578 ins, 929 del, 2442 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it5_csj/wer_15_0.0 +%WER 14.17 [ 3917 / 27651, 542 ins, 966 del, 2409 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it6_csj/wer_15_0.0 +%WER 14.00 [ 3871 / 27651, 442 ins, 1085 del, 2344 sub ] exp/tri4_fmmi_b0.1/decode_eval1_it7_csj/wer_12_1.0 +%WER 14.08 [ 3893 / 27651, 426 ins, 1087 del, 2380 sub ] 
exp/tri4_fmmi_b0.1/decode_eval1_it8_csj/wer_11_1.0 +%WER 14.60 [ 4036 / 27651, 458 ins, 1115 del, 2463 sub ] exp/tri4_mmi_b0.1/decode_eval1_1.mdl_csj/wer_15_0.5 +%WER 14.42 [ 3986 / 27651, 459 ins, 1081 del, 2446 sub ] exp/tri4_mmi_b0.1/decode_eval1_2.mdl_csj/wer_14_0.5 +%WER 14.22 [ 3931 / 27651, 492 ins, 1022 del, 2417 sub ] exp/tri4_mmi_b0.1/decode_eval1_3.mdl_csj/wer_13_0.5 +%WER 13.99 [ 3869 / 27651, 504 ins, 949 del, 2416 sub ] exp/tri4_mmi_b0.1/decode_eval1_4.mdl_csj/wer_12_0.5 +%WER 11.63 [ 3215 / 27651, 384 ins, 804 del, 2027 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval1_csj/wer_12_0.0 +%WER 10.56 [ 2921 / 27651, 366 ins, 662 del, 1893 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval1_csj/wer_13_1.0 +%WER 10.34 [ 2859 / 27651, 363 ins, 660 del, 1836 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval1_csj/wer_14_1.0 === evaluation set 2 === -%WER 19.61 [ 5575 / 28424, 577 ins, 1442 del, 3556 sub ] exp/tri1/decode_eval2_csj/wer_12 -%WER 18.47 [ 5250 / 28424, 572 ins, 1361 del, 3317 sub ] exp/tri2/decode_eval2_csj/wer_12 -%WER 15.71 [ 4464 / 28424, 577 ins, 1128 del, 2759 sub ] exp/tri3/decode_eval2_csj/wer_15 -%WER 13.24 [ 3764 / 28424, 535 ins, 921 del, 2308 sub ] exp/tri4/decode_eval2_csj/wer_16 -%WER 17.90 [ 5088 / 28424, 743 ins, 1057 del, 3288 sub ] exp/tri4/decode_eval2_csj.si/wer_16 -%WER 12.56 [ 3571 / 28424, 595 ins, 767 del, 2209 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_13 -%WER 11.79 [ 3350 / 28424, 584 ins, 669 del, 2097 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_13 -%WER 11.86 [ 3372 / 28424, 619 ins, 643 del, 2110 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_11 -%WER 11.79 [ 3352 / 28424, 603 ins, 659 del, 2090 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_13 -%WER 12.08 [ 3434 / 28424, 602 ins, 701 del, 2131 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_11 -%WER 12.13 [ 3447 / 28424, 561 ins, 735 del, 2151 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_14 -%WER 11.88 [ 3376 / 28424, 575 ins, 676 del, 2125 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_12 -%WER 11.77 [ 3345 / 28424, 588 ins, 646 del, 2111 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_12 -%WER 11.73 [ 3333 / 28424, 586 ins, 658 del, 2089 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_12 -%WER 9.36 [ 2660 / 28424, 357 ins, 561 del, 1742 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_10 -%WER 9.07 [ 2579 / 28424, 467 ins, 404 del, 1708 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_13 -%WER 8.91 [ 2533 / 28424, 439 ins, 399 del, 1695 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_15 +%WER 19.56 [ 5560 / 28424, 560 ins, 1527 del, 3473 sub ] exp/tri1/decode_eval2_csj/wer_12_0.0 +%WER 18.62 [ 5293 / 28424, 610 ins, 1361 del, 3322 sub ] exp/tri2/decode_eval2_csj/wer_12_0.0 +%WER 15.58 [ 4429 / 28424, 626 ins, 1026 del, 2777 sub ] exp/tri3/decode_eval2_csj/wer_13_0.0 +%WER 13.37 [ 3801 / 28424, 643 ins, 844 del, 2314 sub ] exp/tri4/decode_eval2_csj/wer_14_0.0 +%WER 18.03 [ 5126 / 28424, 665 ins, 1178 del, 3283 sub ] exp/tri4/decode_eval2_csj.si/wer_15_0.5 +%WER 12.36 [ 3514 / 28424, 475 ins, 880 del, 2159 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it4_csj/wer_13_0.5 +%WER 11.54 [ 3279 / 28424, 448 ins, 792 del, 2039 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it5_csj/wer_13_0.5 +%WER 11.47 [ 3260 / 28424, 497 ins, 740 del, 2023 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it6_csj/wer_11_0.5 +%WER 11.34 [ 3223 / 28424, 476 ins, 713 del, 2034 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it7_csj/wer_10_1.0 +%WER 11.60 [ 3298 / 28424, 523 ins, 716 
del, 2059 sub ] exp/tri4_fmmi_b0.1/decode_eval2_it8_csj/wer_10_0.5 +%WER 11.86 [ 3372 / 28424, 555 ins, 723 del, 2094 sub ] exp/tri4_mmi_b0.1/decode_eval2_1.mdl_csj/wer_14_0.0 +%WER 11.57 [ 3289 / 28424, 446 ins, 814 del, 2029 sub ] exp/tri4_mmi_b0.1/decode_eval2_2.mdl_csj/wer_13_0.5 +%WER 11.46 [ 3256 / 28424, 510 ins, 684 del, 2062 sub ] exp/tri4_mmi_b0.1/decode_eval2_3.mdl_csj/wer_11_0.5 +%WER 11.58 [ 3292 / 28424, 408 ins, 827 del, 2057 sub ] exp/tri4_mmi_b0.1/decode_eval2_4.mdl_csj/wer_11_1.0 +%WER 9.15 [ 2601 / 28424, 305 ins, 604 del, 1692 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval2_csj/wer_12_0.0 +%WER 8.69 [ 2469 / 28424, 367 ins, 444 del, 1658 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2_csj/wer_12_1.0 +%WER 8.62 [ 2450 / 28424, 349 ins, 444 del, 1657 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval2_csj/wer_13_1.0 === evaluation set 3 === -%WER 25.01 [ 4573 / 18283, 529 ins, 1219 del, 2825 sub ] exp/tri1/decode_eval3_csj/wer_13 -%WER 23.62 [ 4319 / 18283, 499 ins, 1176 del, 2644 sub ] exp/tri2/decode_eval3_csj/wer_14 -%WER 18.04 [ 3298 / 18283, 528 ins, 739 del, 2031 sub ] exp/tri3/decode_eval3_csj/wer_12 -%WER 15.63 [ 2858 / 18283, 411 ins, 719 del, 1728 sub ] exp/tri4/decode_eval3_csj/wer_15 -%WER 19.36 [ 3540 / 18283, 506 ins, 836 del, 2198 sub ] exp/tri4/decode_eval3_csj.si/wer_17 -%WER 14.90 [ 2724 / 18283, 456 ins, 602 del, 1666 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_13 -%WER 13.70 [ 2504 / 18283, 456 ins, 477 del, 1571 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_13 -%WER 13.78 [ 2520 / 18283, 460 ins, 548 del, 1512 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_12 -%WER 13.08 [ 2391 / 18283, 517 ins, 400 del, 1474 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_12 -%WER 13.75 [ 2514 / 18283, 469 ins, 562 del, 1483 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_12 -%WER 14.14 [ 2585 / 18283, 436 ins, 537 del, 1612 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_14 -%WER 13.83 [ 2529 / 18283, 429 ins, 547 del, 1553 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_14 -%WER 13.54 [ 2475 / 18283, 460 ins, 492 del, 1523 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_13 -%WER 13.36 [ 2443 / 18283, 463 ins, 482 del, 1498 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_13 -%WER 10.55 [ 1928 / 18283, 242 ins, 482 del, 1204 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_13 -%WER 9.71 [ 1775 / 18283, 338 ins, 271 del, 1166 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_13 -%WER 9.31 [ 1703 / 18283, 336 ins, 247 del, 1120 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_13 \ No newline at end of file +%WER 25.00 [ 4570 / 18283, 515 ins, 1277 del, 2778 sub ] exp/tri1/decode_eval3_csj/wer_14_0.0 +%WER 23.93 [ 4375 / 18283, 560 ins, 1163 del, 2652 sub ] exp/tri2/decode_eval3_csj/wer_14_0.0 +%WER 17.66 [ 3229 / 18283, 484 ins, 773 del, 1972 sub ] exp/tri3/decode_eval3_csj/wer_14_0.0 +%WER 15.46 [ 2827 / 18283, 311 ins, 860 del, 1656 sub ] exp/tri4/decode_eval3_csj/wer_17_0.5 +%WER 18.92 [ 3459 / 18283, 424 ins, 910 del, 2125 sub ] exp/tri4/decode_eval3_csj.si/wer_16_0.5 +%WER 14.55 [ 2661 / 18283, 423 ins, 629 del, 1609 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it4_csj/wer_14_0.0 +%WER 13.38 [ 2446 / 18283, 362 ins, 572 del, 1512 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it5_csj/wer_13_0.5 +%WER 13.37 [ 2444 / 18283, 484 ins, 470 del, 1490 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it6_csj/wer_11_0.0 +%WER 12.96 [ 2370 / 18283, 332 ins, 570 del, 1468 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it7_csj/wer_12_1.0 +%WER 
13.62 [ 2490 / 18283, 440 ins, 549 del, 1501 sub ] exp/tri4_fmmi_b0.1/decode_eval3_it8_csj/wer_10_0.5 +%WER 13.77 [ 2518 / 18283, 323 ins, 664 del, 1531 sub ] exp/tri4_mmi_b0.1/decode_eval3_1.mdl_csj/wer_15_0.5 +%WER 13.48 [ 2464 / 18283, 334 ins, 618 del, 1512 sub ] exp/tri4_mmi_b0.1/decode_eval3_2.mdl_csj/wer_13_0.5 +%WER 13.28 [ 2428 / 18283, 379 ins, 546 del, 1503 sub ] exp/tri4_mmi_b0.1/decode_eval3_3.mdl_csj/wer_12_0.5 +%WER 13.26 [ 2424 / 18283, 388 ins, 543 del, 1493 sub ] exp/tri4_mmi_b0.1/decode_eval3_4.mdl_csj/wer_12_0.5 +%WER 10.41 [ 1904 / 18283, 289 ins, 422 del, 1193 sub ] exp/dnn5b_pretrain-dbn_dnn/decode_eval3_csj/wer_10_0.0 +%WER 9.34 [ 1707 / 18283, 251 ins, 341 del, 1115 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval3_csj/wer_13_1.0 +%WER 9.10 [ 1664 / 18283, 246 ins, 344 del, 1074 sub ] exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode_eval3_csj/wer_14_1.0 diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/csj/s5/conf/config_opt b/egs/csj/s5/conf/config_opt index 5868d671c3e..e91c33abfa2 100644 --- a/egs/csj/s5/conf/config_opt +++ b/egs/csj/s5/conf/config_opt @@ -3,7 +3,8 @@ # Apache 2.0 # Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. 
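# Note: the cmd.sh files touched in this patch all default to a GridEngine-style queue via queue.pl.
# If no queueing system is available, the same exports can simply point at run.pl for purely local
# execution (running heavy stages one at a time to avoid exhausting memory); a minimal sketch:
# export train_cmd=run.pl
# export decode_cmd=run.pl
# export mkgraph_cmd=run.pl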
-# Current optimized parameter config for CSJ +# Currently optimized parameter config for CSJ + splice=17 nn_depth=6 hid_dim=1905 diff --git a/egs/csj/s5/conf/mfcc.conf b/egs/csj/s5/conf/mfcc.conf index 0e7dfcd69b0..a5b1cbc03a3 100644 --- a/egs/csj/s5/conf/mfcc.conf +++ b/egs/csj/s5/conf/mfcc.conf @@ -1,3 +1,2 @@ --use-energy=false # only non-default option. -#--sample-frequency=8000 # Switchboard is sampled at 8kHz --sample-frequency=16000 # CSJ is sampled at 16kHz diff --git a/egs/csj/s5/local/csj_data_prep.sh b/egs/csj/s5/local/csj_data_prep.sh index 7458c0ce395..73462f17832 100644 --- a/egs/csj/s5/local/csj_data_prep.sh +++ b/egs/csj/s5/local/csj_data_prep.sh @@ -50,7 +50,7 @@ cat $CSJ/dvd{3,5,6,7,8,9,10}/{A*,M*}/*-wav.list 2>/dev/null | sort > $dir/wav.fl n=`cat $dir/wav.flist | wc -l` [ $n -ne 986 ] && \ - echo Warning: expected 986 data data files, found $n + echo "Warning: expected 986 data files (Case : Using 'Academic lecture' and 'Other' data), found $n." # (1a) Transcriptions preparation @@ -102,7 +102,7 @@ awk '{segment=$1; split(segment,S,"[_]"); spkid=S[1]; print $1 " " spkid}' $dir/ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; -# Copy stuff into its final locations. +# Copy stuff into its final locations [this has been moved from the format_data script] mkdir -p data/train for f in spk2utt utt2spk wav.scp text segments; do cp data/local/train/$f data/train/$f || exit 1; diff --git a/egs/csj/s5/local/csj_eval_data_prep.sh b/egs/csj/s5/local/csj_eval_data_prep.sh index 623197775e5..a8b848de4e2 100644 --- a/egs/csj/s5/local/csj_eval_data_prep.sh +++ b/egs/csj/s5/local/csj_eval_data_prep.sh @@ -9,7 +9,7 @@ # To be run from one directory above this script. -# The input is directory name containing the official evaluation test set. +# The input is directory containing the official evaluation test set and transcripts. if [ $# -ne 2 ]; then echo "Usage: "`basename $0`" " diff --git a/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl b/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl index 7895fa3410d..05ff93a54f8 100755 --- a/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl +++ b/egs/csj/s5/local/csj_make_trans/csj2kaldi4m.pl @@ -204,8 +204,10 @@ $word =~ s/\ン\ー/\ン/g; # $word =~ s/\ヮ/\ワ/g; $word =~ s/\ゎ/\わ/g; - $word =~ s/^\ゼロ$/\0/g; - $word =~ s/^\零$/\0/g; + + # Normalization +# $word =~ s/^\ゼロ$/\0/g; +# $word =~ s/^\零$/\0/g; # Arrange morpheme # This function is to arrange morpheme. diff --git a/egs/csj/s5/local/csj_make_trans/csj_automake.sh b/egs/csj/s5/local/csj_make_trans/csj_automake.sh index 132725c0466..8dbb507a631 100644 --- a/egs/csj/s5/local/csj_make_trans/csj_automake.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_automake.sh @@ -11,22 +11,17 @@ if [ $# -ne 2 ]; then exit 1 fi - resource=$1 outd=$2 -csjext=./local/csj_make_trans/csj2kaldi4m.pl -csjconnect=./local/csj_make_trans/csjconnect.pl -k2phone=./local/csj_make_trans/kana2phone -vocab2dic=./local/csj_make_trans/vocab2dic.pl -reform=./local/csj_make_trans/reform.pl +[ ! -e $resource ] && echo "Not exist CSJ or incorrect PATH." && exit 1; -if [ ! -d ./csj-data/dvd17 ];then +if [ ! -e $outd/.done_make_trans ];then ( mkdir -p $outd rm $outd/al_sent4lex.txt -cp ./local/csj_make_trans/overview_csj-data $outd/README.txt +cp local/csj_make_trans/overview_csj-data $outd/README.txt # Make transcription file for each dvd and each lecture [ ! 
-x "`which nkf `" ]\ @@ -35,19 +30,14 @@ cp ./local/csj_make_trans/overview_csj-data $outd/README.txt for vol in dvd{3..17} ;do mkdir -p $outd/$vol + ( for id in `ls $resource/$vol`;do mkdir -p $outd/$vol/${id} rm -r $outd/$vol/00README.txt - - ( nkf -e -d $resource/$vol/$id/${id}.sdb > $outd/$vol/${id}/sdb.tmp - $csjext $outd/$vol/${id}/sdb.tmp $outd/$vol/$id/${id}.4lex $outd/$vol/$id/${id}.4trn.t - - $csjconnect 0.5 10 $outd/$vol/$id/${id}.4trn.t $id > $outd/$vol/$id/${id}-trans.text - + local/csj_make_trans/csj2kaldi4m.pl $outd/$vol/${id}/sdb.tmp $outd/$vol/$id/${id}.4lex $outd/$vol/$id/${id}.4trn.t - [ -z `grep $id local/csj_make_trans/testset` ]\ - && cat $outd/$vol/$id/${id}.4lex >> $outd/al_sent4lex.txt + local/csj_make_trans/csjconnect.pl 0.5 10 $outd/$vol/$id/${id}.4trn.t $id > $outd/$vol/$id/${id}-trans.text rm $outd/$vol/$id/{${id}.4trn.t,sdb.tmp} @@ -56,30 +46,16 @@ for vol in dvd{3..17} ;do else find $resource/$vol/$id -iname ${id}.wav >$outd/$vol/$id/${id}-wav.list fi - - - ) done + )& done wait +echo -n >$outd/.done_make_trans ) fi -## make lexicon.txt -if [ ! -f ./csj-data/lexicon/lexicon.txt ]; then - ( - mkdir -p $outd/lexicon - sort $outd/al_sent4lex.txt >lex.tmp123 - uniq lex.tmp123 > lex.tmp456 - ${vocab2dic} -p $k2phone -o lex.tmp123 lex.tmp456 - $reform lex.tmp123 | sort | uniq > $outd/lexicon/lexicon.txt - mv $outd/al_sent4lex.txt $outd/lexicon - rm lex.tmp123 lex.tmp456 ERROR - ) -fi - ## Exclude speech data given by test set speakers. -if [ ! -d ./csj-data/[eval,excluded] ]; then +if [ ! -e $outd/.done_mv_eval_dup ]; then ( mkdir -p $outd/eval mkdir -p $outd/excluded @@ -89,10 +65,10 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then # Speech data given by test set speakers (eval2 : A01M0056) rm dup_list - for line in `cat local/csj_make_trans/A01M0056_duplication | less`; do + for line in `cat local/csj_make_trans/A01M0056_duplication`; do find $outd/dvd* -iname $line >>dup_list done - for list in `cat dup_list | less`;do + for list in `cat dup_list`;do mv $list $outd/excluded cp dup_list $outd/excluded/duplication.list done @@ -100,10 +76,10 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then # Evaluation data rm dup_list - for line in `cat local/csj_make_trans/testset | less`; do + for line in `cat local/csj_make_trans/testset`; do find $outd/dvd* -iname $line >>dup_list done - for list in `cat dup_list | less`;do + for list in `cat dup_list`;do mv $list $outd/eval cp dup_list $outd/eval/evaluation.list done @@ -114,11 +90,28 @@ if [ ! -d ./csj-data/[eval,excluded] ]; then mv $outd/eval/{A01M0110,A01M0137,A01M0097,A04M0123,A04M0121,A04M0051,A03M0156,A03M0112,A03M0106,A05M0011} $outd/eval/eval1 mv $outd/eval/{A01M0056,A03F0072,A02M0012,A03M0016,A06M0064,A06F0135,A01F0034,A01F0063,A01F0001,A01M0141} $outd/eval/eval2 mv $outd/eval/{S00M0112,S00F0066,S00M0213,S00F0019,S00M0079,S01F0105,S00F0152,S00M0070,S00M0008,S00F0148} $outd/eval/eval3 + + echo -n >$outd/.done_mv_eval_dup + ) +fi + +## make lexicon.txt +if [ ! -e $outd/.done_make_lexicon ]; then + ( + cat $outd/{dvd*,excluded}/*/*.4lex >> $outd/al_sent4lex.txt + mkdir -p $outd/lexicon + sort $outd/al_sent4lex.txt >lex.tmp123 + uniq lex.tmp123 > lex.tmp456 + local/csj_make_trans/vocab2dic.pl -p local/csj_make_trans/kana2phone -o lex.tmp123 lex.tmp456 + local/csj_make_trans/reform.pl lex.tmp123 | sort | uniq > $outd/lexicon/lexicon.txt + mv $outd/al_sent4lex.txt $outd/lexicon + rm lex.tmp123 lex.tmp456 ERROR + + echo -n >$outd/.done_make_lexicon ) fi -comp_num=`ls -l $outd | wc -l` -[ ! $comp_num -eq 20 ] \ +[ ! 
3 -le `ls -a $outd | grep done | wc -l` ] \ && echo "ERROR : Processing is incorrect." && exit 1; -echo "Finish processing original CSJ data" +echo "Finish processing original CSJ data" && echo -n >$outd/.done_make_all diff --git a/egs/csj/s5/local/csj_make_trans/kana2phone b/egs/csj/s5/local/csj_make_trans/kana2phone index 76a0a4bff9e..6979a320389 100644 --- a/egs/csj/s5/local/csj_make_trans/kana2phone +++ b/egs/csj/s5/local/csj_make_trans/kana2phone @@ -141,4 +141,4 @@ ヴ+b u ツ+ts u シ+sh i -チ+ch i +チ+ch i diff --git a/egs/csj/s5/local/csj_make_trans/reform.pl b/egs/csj/s5/local/csj_make_trans/reform.pl index 1c267e2c491..d9f6ac3058b 100755 --- a/egs/csj/s5/local/csj_make_trans/reform.pl +++ b/egs/csj/s5/local/csj_make_trans/reform.pl @@ -1,4 +1,6 @@ #!/usr/bin/env perl +use warnings; + # Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki) # 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) # Apache 2.0 @@ -6,8 +8,6 @@ # This script is to make lexicon for KALDI format. -use warnings; - while (<>){ chomp; @line=split(/\t/, $_); diff --git a/egs/csj/s5/local/nnet/run_dnn.sh b/egs/csj/s5/local/nnet/run_dnn.sh index 028be0b03e7..b0acce39d15 100644 --- a/egs/csj/s5/local/nnet/run_dnn.sh +++ b/egs/csj/s5/local/nnet/run_dnn.sh @@ -25,7 +25,7 @@ # Config: config=conf/config_opt . $config -gmmdir=exp/tri4 +gmmdir=exp/tri4 data_fmllr=data-fmllr-tri4 stage=0 # resume training with --stage=N # End of config. @@ -60,7 +60,7 @@ if [ $stage -le 1 ]; then fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then # Train the DNN optimizing per-frame cross-entropy. dir=exp/dnn5b_pretrain-dbn_dnn ali=${gmmdir}_ali_nodup @@ -86,7 +86,7 @@ dir=exp/dnn5b_pretrain-dbn_dnn_smbr srcdir=exp/dnn5b_pretrain-dbn_dnn acwt=0.0909 -if [ $stage -le 3 ]; then +if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \ $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; diff --git a/egs/csj/s5/local/run_sgmm2.sh b/egs/csj/s5/local/run_sgmm2.sh index a5369e30205..ee836dc2043 100644 --- a/egs/csj/s5/local/run_sgmm2.sh +++ b/egs/csj/s5/local/run_sgmm2.sh @@ -17,14 +17,16 @@ steps/train_sgmm2_group.sh --cmd "$train_cmd" \ 18000 60000 data/train_nodup data/lang exp/tri4_ali_nodup \ exp/ubm5/final.ubm exp/sgmm2_5 || exit 1; + + +graph_dir=exp/sgmm2_5/graph_csj_tg +$train_cmd $graph_dir/mkgraph.log \ + utils/mkgraph.sh data/lang_csj_tg exp/sgmm2_5 $graph_dir for eval_num in `seq 3`; do - graph_dir=exp/sgmm2_5/graph_csj_tg - $train_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/lang_csj_tg exp/sgmm2_5 $graph_dir steps/decode_sgmm2.sh --nj 10 \ --cmd "$decode_cmd" --config conf/decode.config \ - --transform-dir exp/tri4/decode_eval${eval_num}_csj_tg $graph_dir \ - data/eval${eval_num} exp/sgmm2_5/decode_eval${eval_num}_csj_tg + --transform-dir exp/tri4/decode_eval${eval_num}_csj $graph_dir \ + data/eval${eval_num} exp/sgmm2_5/decode_eval${eval_num}_csj done wait @@ -48,10 +50,10 @@ steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \ for eval_num in `seq 3`; do for iter in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \ - --transform-dir exp/tri4/decode_eval${eval_num}_csj_tg \ + --transform-dir exp/tri4/decode_eval${eval_num}_csj \ data/lang_csj_tg data/eval${eval_num} \ - exp/sgmm2_5/decode_eval${eval_num}_csj_tg \ - exp/sgmm2_5_mmi_b0.1/decode_eval${eval_num}_csj_tg_it$iter + exp/sgmm2_5/decode_eval${eval_num}_csj \ + 
exp/sgmm2_5_mmi_b0.1/decode_eval${eval_num}_csj_it$iter done done wait diff --git a/egs/csj/s5/local/score.sh b/egs/csj/s5/local/score.sh deleted file mode 100644 index 05981ab999e..00000000000 --- a/egs/csj/s5/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -link ../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/csj/s5/local/score.sh b/egs/csj/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/csj/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/csj/s5/local/wer_hyp_filter b/egs/csj/s5/local/wer_hyp_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_hyp_filter +++ b/egs/csj/s5/local/wer_hyp_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/local/wer_output_filter b/egs/csj/s5/local/wer_output_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_output_filter +++ b/egs/csj/s5/local/wer_output_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/local/wer_ref_filter b/egs/csj/s5/local/wer_ref_filter index c2911317399..d07b0cf4c28 100644 --- a/egs/csj/s5/local/wer_ref_filter +++ b/egs/csj/s5/local/wer_ref_filter @@ -3,5 +3,5 @@ perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \ - '' + '' '' diff --git a/egs/csj/s5/path.sh b/egs/csj/s5/path.sh index 41f65d7a03c..8a4c29be4f8 100644 --- a/egs/csj/s5/path.sh +++ b/egs/csj/s5/path.sh @@ -1,8 +1,9 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export PATH=$PATH:/usr/local/cuda/bin export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib:/usr/local/lib64:/usr/local/cuda/bin/nvcc -export LC_ALL=C +#export LC_ALL=C diff --git a/egs/csj/s5/run.sh b/egs/csj/s5/run.sh index 6c0af8106c8..fa5355f86f1 100644 --- a/egs/csj/s5/run.sh +++ b/egs/csj/s5/run.sh @@ -19,14 +19,14 @@ set -e # exit on error #: << '#SKIP' -if [ ! -d data/csj-data/eval ]; then +if [ ! -e data ]; then echo "CSJ transcription file does not exist" #local/csj_make_trans/csj_automake.sh || exit 1; - local/csj_make_trans/csj_automake.sh /database/NINJAL/CSJ/ data/csj-data 2>/dev/null + local/csj_make_trans/csj_automake.sh /database/NINJAL/CSJ data/csj-data 2>/dev/null fi wait -[ ! 
-d data/csj-data/eval ]\ +[ ! -e data/csj-data/.done_make_all ]\ && echo "Not finished processing CSJ data" && exit 1; # Prepare Corpus of Spontaneous Japanese (CSJ) data. @@ -36,7 +36,7 @@ local/csj_data_prep.sh data/csj-data/ local/csj_prepare_dict.sh -utils/prepare_lang.sh data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp +utils/prepare_lang.sh --num-sil-states 4 data/local/dict_nosp "" data/local/lang_nosp data/lang_nosp # Now train the language models. local/csj_train_lms.sh data/local/train/text data/local/dict_nosp/lexicon.txt data/local/lm @@ -155,7 +155,7 @@ $train_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_nosp_csj_tg exp/tri3 $graph_dir for eval_num in `seq 3`; do steps/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \ - $graph_dir data/eval${eval_num} exp/tri3/decode_eval${eval_num}_csj + $graph_dir data/eval${eval_num} exp/tri3/decode_eval${eval_num}_csj_nosp done # Now we compute the pronunciation and silence probabilities from training data, diff --git a/egs/csj/s5/steps b/egs/csj/s5/steps deleted file mode 100644 index 5e522274378..00000000000 --- a/egs/csj/s5/steps +++ /dev/null @@ -1 +0,0 @@ -link ../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/csj/s5/steps b/egs/csj/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/csj/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/csj/s5/utils b/egs/csj/s5/utils deleted file mode 100644 index 1ebeb7c52c7..00000000000 --- a/egs/csj/s5/utils +++ /dev/null @@ -1 +0,0 @@ -link ../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/csj/s5/utils b/egs/csj/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/csj/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/local/farsdat_format_data.sh b/egs/farsdat/s5/local/farsdat_format_data.sh index 033538656bd..8e565f11fd0 100644 --- a/egs/farsdat/s5/local/farsdat_format_data.sh +++ b/egs/farsdat/s5/local/farsdat_format_data.sh @@ -25,13 +25,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -49,7 +46,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/farsdat/s5/local/farsdat_prepare_lm.sh b/egs/farsdat/s5/local/farsdat_prepare_lm.sh index 782e1e3ed8f..c04f756d438 100755 --- a/egs/farsdat/s5/local/farsdat_prepare_lm.sh +++ b/egs/farsdat/s5/local/farsdat_prepare_lm.sh @@ -25,13 +25,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -49,7 +46,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/farsdat/s5/path.sh b/egs/farsdat/s5/path.sh index 1e48f21b323..62794699b41 100755 --- a/egs/farsdat/s5/path.sh +++ b/egs/farsdat/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh b/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh index 70d2886cecc..90250ff521b 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 Gaurav Kumar. Apache 2.0 # @@ -12,26 +12,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. 
Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -60,4 +47,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ ! -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! 
-e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xovzf es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder @@ -50,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
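# With the hard-coded paths replaced by command-line arguments, merge_lexicons.py is driven from
# fsp_prepare_dict.sh as "$local/merge_lexicons.py ${tmpdir} ${lexicon}"; concretely, assuming the
# default tmp directory and the LDC lexicon location given in run.sh, the call is roughly:
# local/merge_lexicons.py data/local/tmp /export/corpora/LDC/LDC96L16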
# Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 423d1dd0016..1a6fb5f891b 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/src/nnet:$KALDI_ROOT/src/nnet2:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnet-cpubin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. 
#TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." 
- - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_english/s5/local/fisher_create_test_lang.sh b/egs/fisher_english/s5/local/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/fisher_english/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_english/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
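# After building G.fst with the simplified arpa2fst invocation above, a quick sanity check
# (a sketch, assuming the OpenFst binaries are on the PATH) is to inspect its size and
# stochasticity:
# fstinfo data/lang_test/G.fst | grep -E "# of (states|arcs)"
# fstisstochastic data/lang_test/G.fst   # the first number printed should be close to zero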
@@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh index be2548cc667..eae5f7b8581 100755 --- a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh +++ b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh @@ -34,7 +34,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely # note: 12 epochs is too many, it's taking a very long time. steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \ --num-epochs 12 \ - --io-opts "-tc 10" \ + --io-opts "--max-jobs-run 10" \ --num-jobs-nnet 8 --num-threads 1 \ --minibatch-size 512 --parallel-opts "$parallel_opts" \ --mix-up 15000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2.sh b/egs/fisher_english/s5/local/online/run_nnet2.sh index 97f3d655b78..0b9adb7d315 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2.sh @@ -39,7 +39,7 @@ if [ $stage -le 6 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15 + 5) to (3 + - # 1). The option "--io-opts '-tc 12'" is to have more than the default number + # 1). The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -52,7 +52,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 4 \ --mix-up 12000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2_b.sh b/egs/fisher_english/s5/local/online/run_nnet2_b.sh index e9e0041cf0e..7eac7cf0a7d 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_b.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_b.sh @@ -76,7 +76,7 @@ if [ $stage -le 4 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15 + 5) to (1 + - # 1). The option "--io-opts '-tc 12'" is to have more than the default number + # 1). The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -89,7 +89,7 @@ if [ $stage -le 4 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 5 \ --mix-up 12000 \ diff --git a/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh b/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh index 37a0f91d7cb..47ba36f0072 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_multisplice.sh @@ -15,13 +15,13 @@ set -e # assume use_gpu=true since it would be way too slow otherwise. if ! cuda-compiled; then - cat <&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_swbd/s5/RESULTS b/egs/fisher_swbd/s5/RESULTS index 77306f8df4e..b8fe8371631 100644 --- a/egs/fisher_swbd/s5/RESULTS +++ b/egs/fisher_swbd/s5/RESULTS @@ -42,8 +42,77 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/ %WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys %WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys +# nnet3 result on eval2000 +# BLSTM ran for about 760 hours, command: +# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \ +# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 --chunk-right-context 40 \ +# --extra-left-context 50 --extra-right-context 50 +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 14.8 | 4459 42989 | 87.2 9.4 3.4 2.1 14.8 52.2 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_13_0.0/eval2000_hires.ctm.filt.sys +%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 14.5 | 4459 42989 | 87.5 9.0 3.5 2.0 14.5 51.4 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.filt.sys +%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys + +# nnet3 result on eval2000 for swbd subset +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.3 | 1831 21395 | 91.0 6.4 2.5 1.3 10.3 45.9 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_19_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.0 | 1831 21395 | 91.3 6.3 2.4 1.3 10.0 45.1 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_19_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# nnet3 result on eval2000 for callhm subset +# use tri-gram +for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum 
$x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done +%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 19.0 | 2628 21594 | 83.5 11.7 4.8 2.5 19.0 56.5 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_14_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.7 | 2628 21594 | 83.7 11.5 4.8 2.5 18.7 55.6 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# chain result on eval2000 +# BLSTM ran for about 380 hours +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 13.6 | 4459 42989 | 88.2 7.9 3.9 1.8 13.6 51.0 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +%WER 12.1 | 4459 42989 | 89.7 6.8 3.5 1.8 12.1 50.2 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 13.3 | 4459 42989 | 88.4 7.8 3.8 1.8 13.3 50.1 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +%WER 12.0 | 4459 42989 | 89.6 6.5 3.8 1.7 12.0 49.3 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_8_0.5/eval2000_hires.ctm.filt.sys + +# chain result on eval2000 for swbd subset +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 9.4 | 1831 21395 | 91.7 5.4 2.9 1.2 9.4 43.9 | exp/chain/tdnn_7b_sp/decode_eval2000_fsh_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 8.8 | 1831 21395 | 92.5 5.3 2.2 1.4 8.8 46.9 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_1.0/eval2000_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 9.2 | 1831 21395 | 92.1 5.6 2.3 1.3 9.2 42.4 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_9_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 8.5 | 1831 21395 | 92.6 4.9 2.4 1.2 8.5 44.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_9_1.0/eval2000_hires.ctm.swbd.filt.sys +# chain result on eval2000 for callhm subset +# use tri-gram +for x in exp/chain/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done +%WER 17.4 | 2628 21594 | 84.7 9.8 5.5 2.1 17.4 55.3 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 15.3 | 2628 21594 | 86.9 8.3 4.8 2.2 15.3 52.4 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | 
utils/best_wer.sh ; done +%WER 17.3 | 2628 21594 | 84.9 9.7 5.5 2.1 17.3 55.0 | exp/chain/tdnn_7b_relu_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 15.3 | 2628 21594 | 87.0 8.6 4.4 2.4 15.3 52.1 | exp/chain/blstm_6h_sp/decode_eval2000_fsh_sw1_fg/score_6_0.5/eval2000_hires.ctm.callhm.filt.sys # GMM and SGMM numbers reported on rt03 for x in exp/*/decode_rt03*; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done @@ -89,3 +158,69 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_rt03*_fg; do grep Sum $x/scor %WER 20.2 | 3970 36721 | 88.3 8.1 3.6 8.5 20.2 74.3 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys %WER 19.1 | 3970 36721 | 88.8 7.8 3.4 7.9 19.1 72.2 | exp/nnet2_online/nnet_ms_a_online/decode_rt03_utt_offline_fsh_sw1_fg/score_11/rt03.ctm.swbd.filt.sys +# nnet3 result on rt03 +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 14.7 | 8420 76157 | 86.8 8.9 4.3 1.5 14.7 45.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.filt.sys +%WER 13.6 | 8420 76157 | 87.9 8.4 3.8 1.5 13.6 44.4 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_18_1.0/rt03_hires.ctm.filt.sys +%WER 14.2 | 8420 76157 | 87.0 8.7 4.3 1.2 14.2 46.9 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 14.4 | 8420 76157 | 87.1 8.8 4.2 1.5 14.4 45.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.filt.sys +%WER 13.4 | 8420 76157 | 88.2 8.4 3.4 1.6 13.4 43.6 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_16_0.0/rt03_hires.ctm.filt.sys +%WER 13.9 | 8420 76157 | 87.2 8.4 4.3 1.2 13.9 46.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys + +# nnet3 result on rt03 for swbd subset +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 17.4 | 4450 39436 | 84.3 10.6 5.1 1.8 17.4 48.9 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.5/rt03_hires.ctm.swbd.filt.sys +%WER 16.1 | 4450 39436 | 85.7 9.9 4.4 1.8 16.1 47.2 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_18_0.5/rt03_hires.ctm.swbd.filt.sys +%WER 16.6 | 4450 39436 | 84.7 10.0 5.3 1.3 16.6 49.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_10_0.5/rt03_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 17.1 | 4450 39436 | 84.6 10.3 5.1 1.8 17.1 48.2 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_12_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 15.9 | 4450 39436 | 85.9 9.9 4.2 1.8 15.9 46.7 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_18_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 16.3 | 4450 39436 | 85.0 9.8 5.1 1.3 16.3 49.0 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys + +# nnet3 result on rt03 for fsh subset +# use tri-gram +for x in exp/nnet3/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 11.8 | 3970 36721 | 89.4 7.2 3.5 1.2 11.8 42.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_tg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 10.9 | 3970 36721 | 90.4 6.8 2.7 1.3 10.9 40.6 | 
exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_tg_epoch2.adj/score_15_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 11.6 | 3970 36721 | 89.4 7.1 3.5 1.0 11.6 43.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# rescore with four-gram +for x in exp/nnet3/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 11.4 | 3970 36721 | 89.7 6.9 3.4 1.1 11.4 41.5 | exp/nnet3/tdnn_sp/decode_rt03_fsh_sw1_fg/score_11_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 10.6 | 3970 36721 | 90.7 6.6 2.7 1.3 10.6 39.8 | exp/nnet3/tdnn_sp_smbr/decode_rt03_fsh_sw1_fg_epoch2.adj/score_15_1.0/rt03_hires.ctm.fsh.filt.sys +%WER 11.4 | 3970 36721 | 89.5 6.7 3.8 1.0 11.4 42.6 | exp/nnet3/lstm_bidirectional_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.fsh.filt.sys + +# chain result on rt03 +# BLSTM ran for about 380 hours +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 12.7 | 8420 76157 | 88.5 7.2 4.2 1.3 12.7 43.2 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys +%WER 11.7 | 8420 76157 | 89.8 6.6 3.6 1.5 11.7 43.7 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done +%WER 12.4 | 8420 76157 | 88.9 7.0 4.1 1.3 12.4 42.7 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_9_0.0/rt03_hires.ctm.filt.sys +%WER 11.4 | 8420 76157 | 89.9 6.1 3.9 1.3 11.4 43.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys + +# chain result on rt03 for swbd subset +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 15.0 | 4450 39436 | 86.4 8.6 5.0 1.4 15.0 45.8 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 13.3 | 4450 39436 | 88.3 7.5 4.2 1.6 13.3 45.2 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done +%WER 14.8 | 4450 39436 | 86.5 8.0 5.5 1.3 14.8 45.5 | exp/chain/tdnn_7b_sp/decode_rt03_fsh_sw1_fg/score_10_0.0/rt03_hires.ctm.swbd.filt.sys +%WER 13.0 | 4450 39436 | 88.5 7.3 4.2 1.6 13.0 44.8 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +# chain result on rt03 for fsh subset +# use tri-gram +for x in exp/chain/*/decode_rt03*tg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 10.2 | 3970 36721 | 91.1 6.0 3.0 1.2 10.2 40.2 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 9.8 | 3970 36721 | 91.4 5.3 3.3 1.2 9.8 42.0 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# rescore with four-gram +for x in exp/chain/*/decode_rt03*fg; do grep Sum $x/score_*/*.ctm.fsh.filt.sys | utils/best_wer.sh ; done +%WER 9.8 | 3970 36721 | 91.4 5.8 2.8 1.2 9.8 39.6 | exp/chain/tdnn_7b_relu_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +%WER 9.6 | 3970 36721 | 91.6 5.2 3.3 1.2 9.6 41.4 | exp/chain/blstm_6h_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index e3294fde05a..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ 
b/egs/fisher_swbd/s5/cmd.sh @@ -1,32 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - -#d) Gorgon cluster -#export train_cmd="gorgon_queue.pl -q gorgon" -#export decode_cmd="gorgon_queue.pl -q gorgon" -#export cuda_cmd="gorgon_queue.pl -q gorgon" -#export mkgraph_cmd="gorgon_queue.pl -q gorgon" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/conf/MSU_single_letter.txt b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt new file mode 100644 index 00000000000..1f7b419cca7 --- /dev/null +++ b/egs/fisher_swbd/s5/conf/MSU_single_letter.txt @@ -0,0 +1,26 @@ +A ey +B b iy +C s iy +D d iy +E iy +F eh f +G jh iy +H ey ch +I ay +J jh ey +K k ey +L eh l +M eh m +N eh n +O ow +P p iy +Q k y uw +R aa r +S eh s +T t iy +U y uw +V v iy +W d ah b ax l y uw +X eh k s +Y w ay +Z z iy diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh new file mode 100644 index 00000000000..b70da4e852a --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/blstm_6h +decode_iter= +decode_dir_affix= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= +affix= +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize 0.1 \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay 0 \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 1.414 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + # decoding options + extra_left_context=$[$chunk_left_context+10] + extra_right_context=$[$chunk_right_context+10] + + for decode_set in eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk $chunk_width \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh new file mode 100644 index 00000000000..d0e1093bf93 --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +set -e + +# based on run_tdnn_7b.sh in the swbd recipe + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_7b +decode_iter= + +# training options +num_epochs=4 +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 11000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + steps/nnet3/tdnn/make_configs.py \ + --self-repair-scale 0.00001 \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --relu-dim 725 \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize 0.1 \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target 0.5 \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/fisher_swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir "$common_egs_dir" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $num_jobs --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/fisher_swbd/s5/local/dict.patch b/egs/fisher_swbd/s5/local/dict.patch new file mode 100644 index 00000000000..7fcaa98b4f5 --- /dev/null +++ b/egs/fisher_swbd/s5/local/dict.patch @@ -0,0 +1,378 @@ +8645a8646 +> uh-hum ah m hh ah m +9006c9007 +< April ey p r ih l +--- +> April ey p r ax l +9144d9144 +< B ay zh aa n iy z +9261c9261 +< Battle b ae t el +--- +> Battle b ae t ax l +10014a10015 +> Chevy sh eh v iy +10211a10213 +> Colorado k ao l ax r aa d ow +10212a10215 +> Colorado' k ao l ax r aa d ow z +10370c10373 +< Creek k r ih k +--- +> Creek k r iy k +10889a10893 +> Eleven ax l eh v ih n +10951c10955 +< Erie ih r iy +--- +> Erie iy r iy +11183c11187 +< Forever f ax r eh v er +--- +> Forever f er eh v er +11231a11236 +> Friday f r ay d iy +11744a11750 +> History hh ih s t r iy +12004a12011,12012 +> Israel ih z r ih l +> Israel's ih z r ih l z +12573a12582 +> Lincoln l ih ng k ih n +12574a12584 +> Lincolns l ih ng k ih n z +13268c13278 +< NAACP eh ey ey s iy p iy +--- +> NAACP eh n ey ey s iy p iy +13286c13296 +< NIT eh ay t iy +--- +> NIT eh n ay t iy +13292c13302 +< NTSC eh t iy eh s s iy +--- +> NTSC eh n t iy eh s s iy +14058a14069 +> Quarter k ow r t er +14059a14071 +> Quarterback k ow r t er b ae k +14060a14073 +> Quarters k ow r t er z +14569a14583 +> Science s ay n s +15087a15102 +> Sunday s ah n d iy +15088a15104 +> Sunday's s ah n d iy z 
+15089a15106 +> Sundays s ah n d iy z +15290,15291c15307,15308 +< Texan t eh k sh ih n +< Texan's t eh k sh ih n s +--- +> Texan t eh k s ih n +> Texan's t eh k s ih n s +15335a15353 +> Thousands th aw z ih n z +15739c15757 +< Waco w ae k ow +--- +> Waco w ey k ow +15841a15860 +> Weekends w iy k eh n z +16782a16802 +> acceptable eh k s eh p ax b ax l +16833a16854 +> accounting ax k aw n ih ng +16948a16970 +> address ax d r eh s +17281a17304 +> already aa r d iy +17315a17339 +> am m +17709a17734 +> asked ae s t +17847a17873 +> attorney ih t er n iy +17919a17946 +> autopilot ao t ow p ay l ih t +17960a17988 +> awfully ao f l iy +18221a18250 +> basketball b ae s k ax b ao l +18222a18252 +> basketball's b ae s k ax b ao l z +18302a18333 +> become b ah k ah m +18303a18335 +> becomes b iy k ah m z +18344a18377 +> began b ax g en n +18817c18850 +< bottle b aa t el +--- +> bottle b aa t ax l +19332,19333c19365,19367 +< camera's k ae m ax r ax z +< cameras k ae m ax r ax z +--- +> camera k ae m r ax +> camera's k ae m r ax z +> cameras k ae m r ax z +19411a19446 +> capital k ae p ax l +19505a19541 +> carrying k ae r ih ng +20316a20353,20354 +> combination k aa m ih n ey sh ih n +> combinations k aa m ih n ey sh ih n z +20831a20870 +> contracts k aa n t r ae k s +21010a21050 +> costs k ao s +21062a21103 +> county k aw n iy +21371a21413 +> cultural k ao l ch ax r ax l +21372a21415 +> culturally k ao l ch ax r ax l iy +21373a21417 +> culture k ao l ch er +21375a21420 +> cultures k ao l ch er z +21543a21589 +> data d ey t ax +22097a22144 +> differently d ih f ax r ih n t l iy +22972a23020 +> effects ax f eh k t s +23016a23065 +> election ax l eh k sh ih n +23018a23068 +> elections ax l eh k sh ih n z +23052a23103 +> eleven ax l eh v ih n +23242a23294 +> enjoyable ae n jh oy ax b ax l +23248a23301 +> enjoys ae n jh oy z +23293a23347 +> entire ih n t ay r +23295a23350,23351 +> entirely ih n t ay r l iy +> entirety ih n t ay r t iy +23745a23802 +> extra eh k s t er +23818a23876 +> facts f ae k s +24508c24566 +< forever f ax r eh v er +--- +> forever f er eh v er +24514c24572 +< forget f ow r g eh t +--- +> forget f er r g eh t +24521a24580 +> forgot f er r g aa t +24522a24582 +> forgotten f er r g aa t ax n +24563a24624 +> forward f ow er d +24680a24742 +> frightening f r ay t n ih ng +24742a24805 +> full-time f ax l t ay m +24862a24926 +> garage g r aa jh +25218a25283 +> grandmother g r ae m ah dh er +25790a25856 +> heavily hh eh v ax l iy +25949a26016 +> history hh ih s t r iy +26038a26106 +> honestly aa n ax s t l iy +26039a26108 +> honesty aa n ax s t iy +26099a26169 +> horror hh ow r +26155a26226 +> houses hh aw z ih z +26184c26255 +< huh-uh hh ah hh ah +--- +> huh-uh ah hh ah +26189c26260 +< hum-um hh m hh m +--- +> hum-um ah m hh ah m +26236a26308 +> hunting hh ah n ih ng +26307a26380,26381 +> ideal ay d iy l +> idealist ay d iy l ih s t +26369a26444 +> imagine m ae jh ih n +26628a26704 +> individuals ih n d ih v ih jh ax l z +26968a27045 +> interest ih n t r ih s t +27184a27262 +> it'd ih d +27702a27781 +> lead l iy d +28378a28458 +> mandatory m ae n d ih t ow r iy +28885a28966 +> minute m ih n ih t +29167a29249 +> mountains m aw t n z +29317a29400 +> mysteries m ih s t r iy z +29318a29402 +> mystery m ih s t r iy +29470a29555 +> nervous n er v ih s +29578,29580c29663,29665 +< nobody n ow b aa d iy +< nobody'll n ow b aa d iy l +< nobody's n ow b aa d iy z +--- +> nobody n ow b ah d iy +> nobody'll n ow b ah d iy l +> nobody's n ow b ah d iy z +29712a29798 +> nuclear n uw k l iy r +29938a30025 +> 
onto aa n t ax +30051a30139 +> originally ax r ih jh ax l iy +30507a30596 +> particularly p er t ih k y ax l iy +30755a30845 +> perfectly p er f ih k l iy +30820a30911 +> personally p er s n ax l iy +30915a31007 +> physically f ih z ih k l iy +30986a31079 +> pilot p ay l ih t +30987a31081 +> pilot's p ay l ih t s +31227a31322 +> police p l iy s +31513a31609 +> prefer p er f er +31553a31650 +> prepare p r ax p ey r +31578a31676 +> prescription p er s k r ih p sh ih n +31579a31678 +> prescriptions p er s k r ih p sh ih n z +31770a31870 +> products p r aa d ax k s +31821a31922 +> projects p r aa jh eh k s +31908a32010 +> protect p er t eh k t +31909a32012 +> protected p er t eh k t ih d +31911a32015 +> protection p er t eh k sh ih n +31914a32019 +> protection p er t eh k t ih v +32149a32255 +> quarter k ow r t er +32414a32521 +> read r iy d +32785a32893 +> rehabilitation r iy ax b ih l ih t ey sh ih n +33150a33259 +> resource r ih s ow r s +33151a33261 +> resources r iy s ow r s ih z +33539c33649 +< roots r uh t s +--- +> roots r uw t s +33929a34040 +> science s ay n s +34315a34427 +> seventy s eh v ih n iy +34319,34320c34431,34432 +< severe s ax v iy r +< severely s ax v iy r l iy +--- +> severe s ih v iy r +> severely s ih v iy r l iy +35060a35173 +> software s ao f w ey r +35083a35197 +> solid s ao l ih d +35084a35199 +> solidly s ao l ih d l iy +35750a35866 +> stood s t ih d +35854a35971 +> strictly s t r ih k l iy +35889c36006 +< stronger s t r ao ng er +--- +> stronger s t r ao ng g er +36192a36310,36311 +> supposed s p ow z +> supposed s p ow s +36510a36630 +> tastes t ey s +36856a36977 +> thoroughly th er r l iy +36866a36988 +> thousands th aw z ih n z +37081c37203 +< toots t uh t s +--- +> toots t uw t s +37157a37280 +> toward t w ow r d +37158a37282 +> towards t w ow r d z +37564a37689 +> twenties t w eh n iy z +37565a37691 +> twentieth t w eh n iy ih th +37637a37764 +> unacceptable ah n ae k s eh p ax b ax l +37728a37856 +> understand ah n d er s t ae n +37860a37989 +> unless ih n l eh s +38040a38170 +> use y uw z +38049a38180 +> uses y uw z ih z +38125a38257 +> various v ah r iy ih s +38202a38335 +> versus v er s ih z +38381c38514 +< wacko w ae k ow +--- +> wacko w ey k ow +38455c38588 +< wanna w aa n ax +--- +> wanna w ah n ax +38675c38808 +< whatnot w ah t n aa t +--- +> whatnot w aa t n aa t +38676a38810 +> whatsoever w aa t s ow eh v er +38890c39024 +< wok w aa k +--- +> wok w ao k +38910a39045 +> wondering w ah n d r ih ng diff --git a/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh b/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh +++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh b/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh index 246ef1b888f..fb07544a92a 100755 --- a/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_create_test_lang_fsh.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test_fsh cp -r data/lang/* data/lang_test_fsh -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \ - --osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst - fstisstochastic data/lang_test_fsh/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test_fsh/G.fst +fstisstochastic data/lang_test_fsh/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test_fsh/G.fst | \ echo "$0 succeeded" - diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms.sh b/egs/fisher_swbd/s5/local/fisher_train_lms.sh index 5d8b9e2e18d..a9e3fa4566a 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! 
-f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh index ebc954b756b..3133af6ee1f 100755 --- a/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh +++ b/egs/fisher_swbd/s5/local/fisher_train_lms_fsh.sh @@ -30,6 +30,7 @@ export PATH=$PATH:`pwd`/../../../tools/kaldi_lm else echo Downloading and installing the kaldi_lm tools if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || wget http://merlin.fit.vutbr.cz/kaldi/kaldi_lm.tar.gz || exit 1; fi tar -xvzf kaldi_lm.tar.gz || exit 1; diff --git a/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh new file mode 100644 index 00000000000..4d083d61d0e --- /dev/null +++ b/egs/fisher_swbd/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +. ./cmd.sh +set -e +stage=1 +train_stage=-10 +generate_alignments=true # false if doing chain training +speed_perturb=true + +. ./path.sh +. ./utils/parse_options.sh + +# perturbed data preparation +train_set=train_nodup +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_nodup; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_nodup_sp data/lang_nosp exp/tri5a exp/tri5a_ali_nodup_sp || exit 1 + fi + train_set=train_nodup_sp +fi + +if [ $stage -le 3 ]; then + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + fi + + # the 100k_nodup directory is copied seperately, as + # we want to use exp/tri1b_ali_100k_nodup for lda_mllt training + # the main train directory might be speed_perturbed + for dataset in $train_set train_100k_nodup; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + + # scale the waveforms, this is useful as we don't use CMVN + data_dir=data/${dataset}_hires + cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in eval2000 rt03; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri1b_ali exp/nnet3/tri2b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri2b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. 
+ + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 rt03; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh b/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh new file mode 100644 index 00000000000..fec07fb2983 --- /dev/null +++ b/egs/fisher_swbd/s5/local/nnet3/run_lstm.sh @@ -0,0 +1,158 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script +# LSTM script runs for more epochs than the TDNN script +# and each epoch takes twice the time + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false + +stage=0 +train_stage=-10 +affix= +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 + + +# training options +num_epochs=8 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=15 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <' $dir/score_LMWT_${wip}/stm.swbd '&&' \ - grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; done fi @@ -137,8 +137,8 @@ rt03* ) if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \ - grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ - grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1; done fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh new file mode 100755 index 00000000000..95c9d5e58a4 --- /dev/null +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Switchboard-1 training data preparation customized for Edinburgh +# Author: Arnab Ghoshal (Jan 2013) + +# To be run from one directory above this script. + +## The input is some directory containing the switchboard-1 release 2 +## corpus (LDC97S62). Note: we don't make many assumptions about how +## you unpacked this. We are just doing a "find" command to locate +## the .sph files. + +. path.sh + +#check existing directories +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" + exit 1; +fi + +SWBD_DIR=$1 + +dir=data/local/train_swbd +mkdir -p $dir + +# Audio data directory check +if [ ! -d $SWBD_DIR ]; then + echo "Error: run.sh requires a directory argument" + exit 1; +fi + +# Trans directory check +if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then + ( + cd $dir; + if [ ! -d swb_ms98_transcriptions ]; then + echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || + wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz + tar -xf switchboard_word_alignments.tar.gz + fi + ) +else + echo "Directory with transcriptions exists, skipping downloading" + [ -f $dir/swb_ms98_transcriptions ] \ + || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ +fi diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 552e304a6a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" @@ -34,22 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! 
-x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; @@ -101,7 +84,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text diff --git a/egs/fisher_swbd/s5/path.sh b/egs/fisher_swbd/s5/path.sh index 3b05dc5e2ba..e14c6074f6b 100755 --- a/egs/fisher_swbd/s5/path.sh +++ b/egs/fisher_swbd/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../../ export PWD=`pwd` -export PATH=$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/online2bin:$PWD/stanford-utils:$KALDI_ROOT/src/stanford-bin/:$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet-cpubin/:$KALDI_ROOT/src/kwsbin:$PWD:$KALDI_ROOT/tools/kaldi_lm:$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$PATH +export PATH=$KALDI_ROOT/src/ivectorbin:$PWD/stanford-utils:$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$KALDI_ROOT/tools/kaldi_lm:$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_swbd/s5/run.sh b/egs/fisher_swbd/s5/run.sh index 4bb0a55b0a9..fa3ad62fa84 100755 --- a/egs/fisher_swbd/s5/run.sh +++ b/egs/fisher_swbd/s5/run.sh @@ -25,7 +25,6 @@ local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 - utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp @@ -135,15 +134,14 @@ local/remove_dup_utts.sh 300 data/train data/train_nodup ) # Start training on the Switchboard subset, which has cleaner alignments - steps/train_mono.sh --nj 3 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang_nopp exp/mono0a + data/train_10k_nodup data/lang_nosp exp/mono0a steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_30k_nodup data/lang_nopp exp/mono0a exp/mono0a_ali || exit 1; + data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup data/lang_nopp exp/mono0a_ali exp/tri1a || exit 1; + 3200 30000 data/train_30k_nodup data/lang_nosp exp/mono0a_ali exp/tri1a || exit 1; #used to be 2500 20000 ( graph_dir=exp/tri1a/graph_nosp_fsh_sw1_tg diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh index 584702b4122..6675dd20f71 100755 --- a/egs/gale_arabic/s5/local/gale_format_data.sh +++ b/egs/gale_arabic/s5/local/gale_format_data.sh @@ -6,9 +6,9 @@ if [ -f path.sh ]; then . path.sh; else echo "missing path.sh"; exit 1; -fi +fi -for dir in test train; do +for dir in test train; do cp -pr data/local/$dir data/$dir done @@ -21,26 +21,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". 
These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. diff --git a/egs/gale_arabic/s5/local/nnet/run_lstm.sh b/egs/gale_arabic/s5/local/nnet/run_lstm.sh index 39854360e14..aeb2272976b 100755 --- a/egs/gale_arabic/s5/local/nnet/run_lstm.sh +++ b/egs/gale_arabic/s5/local/nnet/run_lstm.sh @@ -45,7 +45,7 @@ if [ $stage -le 1 ]; then steps/nnet/train.sh --network-type lstm --learn-rate 0.00001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ --proto-opts "--clip-gradient 5.0" \ - --train-opts "--momentum 0.9 --halving-factor 0.65" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.65" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/gale_arabic/s5/local/online/run_nnet2.sh b/egs/gale_arabic/s5/local/online/run_nnet2.sh index 6926a3670be..8ccbda5a8dc 100644 --- a/egs/gale_arabic/s5/local/online/run_nnet2.sh +++ b/egs/gale_arabic/s5/local/online/run_nnet2.sh @@ -126,7 +126,7 @@ if [ $stage -le 6 ]; then # Because we have a lot of data here and we don't want the training to take # too long, we reduce the number of epochs from the defaults (15) to (8). - # The option "--io-opts '-tc 12'" is to have more than the default number + # The option "--io-opts '--max-jobs-run 12'" is to have more than the default number # (5) of jobs dumping the egs to disk; this is OK since we're splitting our # data across four filesystems for speed. @@ -139,7 +139,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 12" \ + --io-opts "--max-jobs-run 12" \ --num-jobs-nnet 6 \ --num-hidden-layers 4 \ --mix-up 12000 \ diff --git a/egs/gale_arabic/s5/path.sh b/egs/gale_arabic/s5/path.sh index db21a99a725..be11b34cbc6 100755 --- a/egs/gale_arabic/s5/path.sh +++ b/egs/gale_arabic/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/gale_mandarin/s5/local/gale_format_data.sh b/egs/gale_mandarin/s5/local/gale_format_data.sh index 15a2bfaef52..71187e89a12 100755 --- a/egs/gale_mandarin/s5/local/gale_format_data.sh +++ b/egs/gale_mandarin/s5/local/gale_format_data.sh @@ -6,9 +6,9 @@ if [ -f path.sh ]; then . path.sh; else echo "missing path.sh"; exit 1; -fi +fi -for dir in dev train; do +for dir in dev train; do cp -pr data/local/$dir data/$dir done @@ -22,26 +22,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz rm -r data/lang_dev cp -r data/lang data/lang_dev -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_dev/words.txt \ - --osymbols=data/lang_dev/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_dev/G.fst - fstisstochastic data/lang_dev/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_dev/G.fst +fstisstochastic data/lang_dev/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. 
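The G.fst construction in these format_data scripts is being simplified to a single arpa2fst call; as a minimal sketch of the new pattern (the data/lang_test directory is illustrative and should be whatever lang directory the surrounding script actually uses; the gale_mandarin script above keeps its LM under data/lang_dev, so there the symbol table and output path would be data/lang_dev/words.txt and data/lang_dev/G.fst to match the fstisstochastic check that follows):

  gunzip -c "$arpa_lm" | \
    arpa2fst --disambig-symbol=#0 \
      --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
  fstisstochastic data/lang_test/G.fst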
diff --git a/egs/gale_mandarin/s5/path.sh b/egs/gale_mandarin/s5/path.sh index db21a99a725..be11b34cbc6 100755 --- a/egs/gale_mandarin/s5/path.sh +++ b/egs/gale_mandarin/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/gp/s1/local/gp_format_lms_edin.sh b/egs/gp/s1/local/gp_format_lms_edin.sh index 7fa6f181060..60e3c266d5c 100755 --- a/egs/gp/s1/local/gp_format_lms_edin.sh +++ b/egs/gp/s1/local/gp_format_lms_edin.sh @@ -40,20 +40,10 @@ function format_lms () { cp $work_dir/lang_test/$f $test done + # kkm: I am removing fstdeterminizelog from the following pipe, no point. gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | find_arpa_oovs.pl $test/words.txt > $test/oovs_${lm_suffix}.txt - - # Removing all "illegal" combinations of and , which are supposed to - # occur only at being/end of utt. These can cause determinization failures - # of CLG [ends up being epsilon cycles]. - gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | remove_oovs.pl $test/oovs_${lm_suffix}.txt \ - | eps2disambig.pl | s2eps.pl \ - | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstdeterminizelog > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst set +e fstisstochastic $test/G.fst set -e @@ -73,7 +63,7 @@ function format_lms () { < $work_dir/local/lexicon_??.txt >tmpdir.g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst - fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r tmpdir.g @@ -99,7 +89,7 @@ echo "Preparing language models for test" format_lms GE17k_tg $WDIR/GE; format_lms GE17k_tg_pr $WDIR/GE; } >& $WDIR/GE/format_lms.log -# German - 60K +# German - 60K { format_lms GE60k_bg $WDIR/GE; format_lms GE60k_tg $WDIR/GE; format_lms GE60k_tg_pr $WDIR/GE; } >> $WDIR/GE/format_lms.log 2>&1 @@ -115,7 +105,7 @@ echo "Preparing language models for test" format_lms SP23k_tg_pr $WDIR/SP; } >& $WDIR/SP/format_lms.log # Swedish - 24K -# TODO(arnab): Something going wrong with the Swedish trigram LM. +# TODO(arnab): Something going wrong with the Swedish trigram LM. 
{ # format_lms SW24k_tg $WDIR/SW; # format_lms SW24k_tg_pr $WDIR/SW; format_lms SW24k_bg $WDIR/SW; } >& $WDIR/SW/format_lms.log diff --git a/egs/gp/s5/RESULTS b/egs/gp/s5/RESULTS index 760545cf59d..297ef23d2da 100644 --- a/egs/gp/s5/RESULTS +++ b/egs/gp/s5/RESULTS @@ -1,4 +1,94 @@ -$ for L in $GP_LANGUAGES; do grep WER exp/$L/mono/decode_dev_tgpr_sri/wer_* | ./utils/best_wer.sh ; doneexp/CZ/mono/decode_dev_tgpr_sri/wer_9:%WER 35.13 [ 5820 / 16568, 486 ins, 1116 del, 4218 sub ] +#!/bin/bash + +# this RESULTS file was obtained by Bogdan Vlasenko in February 2016. + +for x in exp/*/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done + +# Monophone, MFCC+delta+accel +%WER 45.16 [ 10073 / 22306, 684 ins, 2010 del, 7379 sub ] exp/FR/mono/decode_dev_tgpr_sri/wer_8 +%WER 26.96 [ 4149 / 15387, 285 ins, 933 del, 2931 sub ] exp/GE/mono/decode_dev_tgpr_sri/wer_11 +%WER 52.95 [ 10040 / 18962, 588 ins, 2182 del, 7270 sub ] exp/RU/mono/decode_dev_tgpr_sri/wer_8 + +%WER 41.80 [ 9071 / 21700, 513 ins, 1876 del, 6682 sub ] exp/FR/mono/decode_eval_tgpr_sri/wer_9 +%WER 44.71 [ 5347 / 11959, 399 ins, 1024 del, 3924 sub ] exp/GE/mono/decode_eval_tgpr_sri/wer_9 +%WER 51.55 [ 9416 / 18266, 533 ins, 1975 del, 6908 sub ] exp/RU/mono/decode_eval_tgpr_sri/wer_9 + +# First triphone build. +%WER 28.44 [ 6343 / 22306, 751 ins, 742 del, 4850 sub ] exp/FR/tri1/decode_dev_tgpr_sri/wer_14 +%WER 15.29 [ 2353 / 15387, 288 ins, 388 del, 1677 sub ] exp/GE/tri1/decode_dev_tgpr_sri/wer_18 +%WER 36.40 [ 6903 / 18962, 578 ins, 1876 del, 4449 sub ] exp/RU/tri1/decode_dev_tgpr_sri/wer_13 + +%WER 26.26 [ 5699 / 21700, 697 ins, 526 del, 4476 sub ] exp/FR/tri1/decode_eval_tgpr_sri/wer_13 +%WER 22.80 [ 2727 / 11959, 386 ins, 329 del, 2012 sub ] exp/GE/tri1/decode_eval_tgpr_sri/wer_15 +%WER 34.69 [ 6336 / 18266, 547 ins, 1470 del, 4319 sub ] exp/RU/tri1/decode_eval_tgpr_sri/wer_14 + +# tri2a is delta+delta-delta features. +%WER 28.45 [ 6345 / 22306, 771 ins, 725 del, 4849 sub ] exp/FR/tri2a/decode_dev_tgpr_sri/wer_14 +%WER 15.13 [ 2328 / 15387, 320 ins, 345 del, 1663 sub ] exp/GE/tri2a/decode_dev_tgpr_sri/wer_17 +%WER 36.62 [ 6944 / 18962, 526 ins, 2083 del, 4335 sub ] exp/RU/tri2a/decode_dev_tgpr_sri/wer_14 + +%WER 26.18 [ 5681 / 21700, 694 ins, 542 del, 4445 sub ] exp/FR/tri2a/decode_eval_tgpr_sri/wer_14 +%WER 22.52 [ 2693 / 11959, 341 ins, 363 del, 1989 sub ] exp/GE/tri2a/decode_eval_tgpr_sri/wer_17 +%WER 34.37 [ 6278 / 18266, 594 ins, 1378 del, 4306 sub ] exp/RU/tri2a/decode_eval_tgpr_sri/wer_14 + +# LDA+MLLT. +%WER 27.76 [ 6192 / 22306, 723 ins, 824 del, 4645 sub ] exp/FR/tri2b/decode_dev_tgpr_sri/wer_17 +%WER 13.78 [ 2121 / 15387, 300 ins, 313 del, 1508 sub ] exp/GE/tri2b/decode_dev_tgpr_sri/wer_18 +%WER 34.68 [ 6576 / 18962, 521 ins, 1872 del, 4183 sub ] exp/RU/tri2b/decode_dev_tgpr_sri/wer_15 + +%WER 25.43 [ 5519 / 21700, 724 ins, 532 del, 4263 sub ] exp/FR/tri2b/decode_eval_tgpr_sri/wer_15 +%WER 21.26 [ 2542 / 11959, 307 ins, 372 del, 1863 sub ] exp/GE/tri2b/decode_eval_tgpr_sri/wer_17 +%WER 32.83 [ 5997 / 18266, 522 ins, 1431 del, 4044 sub ] exp/RU/tri2b/decode_eval_tgpr_sri/wer_16 + +# LDA+MLLT+SAT. 
+%WER 25.62 [ 5714 / 22306, 746 ins, 634 del, 4334 sub ] exp/FR/tri3b/decode_dev_tgpr_sri/wer_18 +%WER 11.01 [ 1694 / 15387, 311 ins, 205 del, 1178 sub ] exp/GE/tri3b/decode_dev_tgpr_sri/wer_20 +%WER 32.48 [ 6159 / 18962, 556 ins, 1534 del, 4069 sub ] exp/RU/tri3b/decode_dev_tgpr_sri/wer_17 + +%WER 23.82 [ 5169 / 21700, 685 ins, 478 del, 4006 sub ] exp/FR/tri3b/decode_eval_tgpr_sri/wer_17 +%WER 17.72 [ 2119 / 11959, 329 ins, 248 del, 1542 sub ] exp/GE/tri3b/decode_eval_tgpr_sri/wer_18 +%WER 31.24 [ 5706 / 18266, 657 ins, 1046 del, 4003 sub ] exp/RU/tri3b/decode_eval_tgpr_sri/wer_16 + +# Some "SGMM2" experiments. +%WER 24.76 [ 5524 / 22306, 716 ins, 623 del, 4185 sub ] exp/FR/sgmm2_4a/decode_dev_tgpr_sri/wer_12 +%WER 9.61 [ 1478 / 15387, 253 ins, 174 del, 1051 sub ] exp/GE/sgmm2_4a/decode_dev_tgpr_sri/wer_13 +%WER 30.27 [ 5740 / 18962, 505 ins, 1301 del, 3934 sub ] exp/RU/sgmm2_4a/decode_dev_tgpr_sri/wer_12 + +%WER 22.88 [ 4965 / 21700, 675 ins, 430 del, 3860 sub ] exp/FR/sgmm2_4a/decode_eval_tgpr_sri/wer_11 +%WER 16.03 [ 1917 / 11959, 267 ins, 224 del, 1426 sub ] exp/GE/sgmm2_4a/decode_eval_tgpr_sri/wer_12 +%WER 29.06 [ 5309 / 18266, 494 ins, 1107 del, 3708 sub ] exp/RU/sgmm2_4a/decode_eval_tgpr_sri/wer_13 + +%WER 24.16 [ 5389 / 22306, 733 ins, 559 del, 4097 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_12 +%WER 24.02 [ 5359 / 22306, 733 ins, 534 del, 4092 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_12 +%WER 24.23 [ 5405 / 22306, 754 ins, 532 del, 4119 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_12 +%WER 24.50 [ 5464 / 22306, 727 ins, 574 del, 4163 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_13 +%WER 9.22 [ 1418 / 15387, 266 ins, 146 del, 1006 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_12 +%WER 9.17 [ 1411 / 15387, 253 ins, 153 del, 1005 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_13 +%WER 9.18 [ 1412 / 15387, 264 ins, 150 del, 998 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_13 +%WER 9.31 [ 1432 / 15387, 271 ins, 150 del, 1011 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_13 +%WER 29.96 [ 5681 / 18962, 465 ins, 1549 del, 3667 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it1/wer_11 +%WER 30.39 [ 5762 / 18962, 500 ins, 1669 del, 3593 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it2/wer_10 +%WER 31.00 [ 5879 / 18962, 420 ins, 1864 del, 3595 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it3/wer_11 +%WER 31.50 [ 5973 / 18962, 433 ins, 1926 del, 3614 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_dev_tgpr_sri_it4/wer_11 + +%WER 22.51 [ 4885 / 21700, 672 ins, 423 del, 3790 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_12 +%WER 22.56 [ 4896 / 21700, 702 ins, 380 del, 3814 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_11 +%WER 22.70 [ 4925 / 21700, 670 ins, 398 del, 3857 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_12 +%WER 22.83 [ 4954 / 21700, 681 ins, 400 del, 3873 sub ] exp/FR/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_12 +%WER 15.28 [ 1827 / 11959, 291 ins, 178 del, 1358 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_11 +%WER 15.22 [ 1820 / 11959, 271 ins, 190 del, 1359 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_12 +%WER 15.35 [ 1836 / 11959, 281 ins, 187 del, 1368 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_12 +%WER 15.38 [ 1839 / 11959, 252 ins, 205 del, 1382 sub ] exp/GE/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_13 +%WER 28.31 [ 5172 / 18266, 496 ins, 1127 del, 3549 sub ] 
exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it1/wer_11 +%WER 28.64 [ 5232 / 18266, 446 ins, 1321 del, 3465 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it2/wer_11 +%WER 28.96 [ 5289 / 18266, 458 ins, 1334 del, 3497 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it3/wer_10 +%WER 29.55 [ 5398 / 18266, 421 ins, 1477 del, 3500 sub ] exp/RU/sgmm2_4a_mmi_b0.1/decode_eval_tgpr_sri_it4/wer_11 + + +# these RESULTS were obtained with Arnab Ghoshal version of the script established in 2012. +$ for L in $GP_LANGUAGES; do grep WER exp/$L/mono/decode_dev_tgpr_sri/wer_* | ./utils/best_wer.sh ; done + +exp/CZ/mono/decode_dev_tgpr_sri/wer_9:%WER 35.13 [ 5820 / 16568, 486 ins, 1116 del, 4218 sub ] exp/FR/mono/decode_dev_tgpr_sri/wer_9:%WER 45.69 [ 10192 / 22306, 533 ins, 2323 del, 7336 sub ] exp/GE/mono/decode_dev_tgpr_sri/wer_11:%WER 27.48 [ 4228 / 15387, 278 ins, 974 del, 2976 sub ] exp/PL/mono/decode_dev_tgpr_sri/wer_9:%WER 36.45 [ 6437 / 17660, 607 ins, 1228 del, 4602 sub ] @@ -16,5 +106,3 @@ exp/PO/tri1/decode_dev_tgpr_sri/wer_14:%WER 26.56 [ 3461 / 13030, 477 ins, 795 d exp/RU/tri1/decode_dev_tgpr_sri/wer_14:%WER 33.89 [ 6427 / 18962, 575 ins, 1612 del, 4240 sub ] exp/SP/tri1/decode_dev_tgpr_sri/wer_16:%WER 26.73 [ 5105 / 19098, 551 ins, 1313 del, 3241 sub ] qghoshal@merlin:[~/globalphone/a1.1] - - diff --git a/egs/gp/s5/run.sh b/egs/gp/s5/run.sh index ed345efef14..933d3a4f566 100755 --- a/egs/gp/s5/run.sh +++ b/egs/gp/s5/run.sh @@ -2,6 +2,16 @@ # Copyright 2012 Arnab Ghoshal +# +# Copyright 2016 by Idiap Research Institute, http://www.idiap.ch +# +# See the file COPYING for the licence associated with this software. +# +# Author(s): +# Bogdan Vlasenko, February 2016 +# + + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,47 +32,37 @@ echo "This shell script may run as-is on your system, but it is recommended that you run the commands one by one by copying and pasting into the shell." #exit 1; -[ -f cmd.sh ] && source ./cmd.sh \ - || echo "cmd.sh not found. Jobs may not execute properly." +[ -f cmd.sh ] && source ./cmd.sh || echo "cmd.sh not found. Jobs may not execute properly." # CHECKING FOR AND INSTALLING REQUIRED TOOLS: # This recipe requires shorten (3.6.1) and sox (14.3.2). # If they are not found, the local/gp_install.sh script will install them. -local/gp_check_tools.sh $PWD path.sh +#local/gp_check_tools.sh $PWD path.sh || exit 1; . path.sh || { echo "Cannot source path.sh"; exit 1; } # Set the locations of the GlobalPhone corpus and language models -GP_CORPUS=/mnt/matylda2/data/GLOBALPHONE -# GP_LM=/mnt/matylda6/ijanda/GLOBALPHONE_LM +GP_CORPUS=/idiap/resource/database/GLOBALPHONE GP_LM=$PWD/language_models # Set the languages that will actually be processed -# export GP_LANGUAGES="CZ FR GE PL PO RU SP VN" -export GP_LANGUAGES="CZ FR GE PL PO SP" +export GP_LANGUAGES="FR GE RU" # The following data preparation step actually converts the audio files from # shorten to WAV to take out the empty files and those with compression errors. 
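# Roughly, the shorten-to-WAV conversion done by the data preparation amounts to
# the sketch below; the file name and raw-format flags (sample rate, encoding,
# channel count) are assumptions for illustration and depend on the corpus:
#   shorten -x utt.adc.shn - | sox -t raw -r 16000 -e signed-integer -b 16 -c 1 - utt.wav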
-local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$GP_CORPUS \ - --languages="$GP_LANGUAGES" -local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES -# # Use the following to map to X-SAMPA phoneset -# local/gp_dict_prep.sh --config-dir $PWD/conf --map-dir $PWD/conf/xsampa_map \ -# $GP_CORPUS $GP_LANGUAGES +local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$GP_CORPUS --languages="$GP_LANGUAGES" || exit 1; +local/gp_dict_prep.sh --config-dir $PWD/conf $GP_CORPUS $GP_LANGUAGES || exit 1; for L in $GP_LANGUAGES; do - utils/prepare_lang.sh --position-dependent-phones true \ - data/$L/local/dict "" data/$L/local/lang_tmp data/$L/lang \ - >& data/$L/prepare_lang.log || exit 1; + utils/prepare_lang.sh --position-dependent-phones true \ + data/$L/local/dict "" data/$L/local/lang_tmp data/$L/lang \ + >& data/$L/prepare_lang.log || exit 1; done # Convert the different available language models to FSTs, and create separate # decoding configurations for each. for L in $GP_LANGUAGES; do - # $highmem_cmd data/$L/format_lm.log \ - # local/gp_format_lm.sh --filter-vocab-sri false $GP_LM $L & - $highmem_cmd data/$L/format_lm.log \ - local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L & + local/gp_format_lm.sh --filter-vocab-sri true $GP_LM $L & done wait @@ -72,13 +72,14 @@ for L in $GP_LANGUAGES; do for x in train dev eval; do ( steps/make_mfcc.sh --nj 6 --cmd "$train_cmd" data/$L/$x \ - exp/$L/make_mfcc/$x $mfccdir; + exp/$L/make_mfcc/$x $mfccdir; steps/compute_cmvn_stats.sh data/$L/$x exp/$L/make_mfcc/$x $mfccdir; ) & done done wait; + for L in $GP_LANGUAGES; do mkdir -p exp/$L/mono; steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ @@ -86,86 +87,107 @@ for L in $GP_LANGUAGES; do done wait; + for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/mono/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ - $graph_dir + utils/mkgraph.sh --mono data/$L/lang_test_${lm_suffix} exp/$L/mono \ + $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - exp/$L/mono/decode_dev_${lm_suffix} + exp/$L/mono/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/mono/decode_eval_${lm_suffix} ) & done done - +# Train tri1, which is first triphone pass for L in $GP_LANGUAGES; do ( mkdir -p exp/$L/mono_ali steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/mono exp/$L/mono_ali \ - >& exp/$L/mono_ali/align.log + data/$L/train data/$L/lang exp/$L/mono exp/$L/mono_ali \ + >& exp/$L/mono_ali/align.log num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri1 - steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/mono_ali \ - exp/$L/tri1 >& exp/$L/tri1/train.log - ) & + steps/train_deltas.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/mono_ali exp/$L/tri1 >& exp/$L/tri1/train.log + ) & done wait; - +# Decode tri1 for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri1/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri1 $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri1 \ + $graph_dir steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - 
exp/$L/tri1/decode_dev_${lm_suffix} + exp/$L/tri1/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri1/decode_eval_${lm_suffix} ) & done done -# SAT-trained triphone systems: MFCC feats + +# Train tri2a, which is deltas + delta-deltas for L in $GP_LANGUAGES; do ( - mkdir -p exp/$L/tri1_ali_fmllr - steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali_fmllr \ - >& exp/$L/tri1_ali_fmllr/align.log || exit 1; + mkdir -p exp/$L/tri1_ali + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali \ + >& exp/$L/tri1_ali/tri1_ali.log num_states=$(grep "^$L" conf/tri.conf | cut -f2) num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) mkdir -p exp/$L/tri2a - steps/train_sat.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ - exp/$L/tri2a >& exp/$L/tri2a/train.log + steps/train_deltas.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/tri1_ali exp/$L/tri2a >& exp/$L/tri2a/train.log ) & done wait; +# Decode tri2a for L in $GP_LANGUAGES; do for lm_suffix in tgpr_sri; do ( graph_dir=exp/$L/tri2a/graph_${lm_suffix} mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2a $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2a \ + $graph_dir - steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ - exp/$L/tri2a/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ + exp/$L/tri2a/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri2a/decode_eval_${lm_suffix} ) & done done +# Train tri2b, which is LDA+MLLT +for L in $GP_LANGUAGES; do + ( + num_states=$(grep "^$L" conf/tri.conf | cut -f2) + num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) + mkdir -p exp/$L/tri2b + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" $num_states $num_gauss data/$L/train \ + data/$L/lang exp/$L/tri1_ali exp/$L/tri2b >& exp/$L/tri2b/tri2_ali.log + ) & +done +wait; + # for L in $GP_LANGUAGES; do # mode=4 # # Doing this only for the LMs whose vocabs were limited using SRILM, since the @@ -175,19 +197,152 @@ done # exp/$L/tri2a/decode_dev_tgpr_sri exp/$L/tri2a/decode_dev_tg_sri$mode # done +# Decode tri2b +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/tri2b/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri2b \ + $graph_dir + + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/dev \ + exp/$L/tri2b/decode_dev_${lm_suffix} + steps/decode.sh --nj 5 --cmd "$decode_cmd" $graph_dir data/$L/eval \ + exp/$L/tri2b/decode_eval_${lm_suffix} + ) & + done +done +wait; + +# Train tri3b, which is LDA+MLLT+SAT. 
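# tri3b adds speaker-adapted training (SAT) on top of the LDA+MLLT features:
# per-speaker fMLLR transforms are estimated during training, which is why the
# decoding below uses steps/decode_fmllr.sh and the later SGMM stages align
# with steps/align_fmllr.sh.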
+for L in $GP_LANGUAGES; do + ( + mkdir -p exp/$L/tri2b_ali + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + --use-graphs true data/$L/train data/$L/lang exp/$L/tri2b exp/$L/tri2b_ali \ + >& exp/$L/tri2b_ali/align.log + + num_states=$(grep "^$L" conf/tri.conf | cut -f2) + num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) + mkdir -p exp/$L/tri3b + steps/train_sat.sh --cmd "$train_cmd" \ + --cluster-thresh 100 $num_states $num_gauss data/$L/train data/$L/lang \ + exp/$L/tri2b_ali exp/$L/tri3b >& exp/$L/tri3b/train.log + ) & +done +wait; + +# Decode 3b +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/tri3b/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/tri3b \ + $graph_dir + + mkdir -p exp/$L/tri3b/decode_dev_${lm_suffix} + steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" \ + $graph_dir data/$L/dev exp/$L/tri3b/decode_dev_${lm_suffix} + steps/decode_fmllr.sh --nj 5 --cmd "$decode_cmd" \ + $graph_dir data/$L/eval exp/$L/tri3b/decode_eval_${lm_suffix} + ) & +done +done +wait; + +## Train sgmm2b, which is SGMM on top of LDA+MLLT+SAT features. +for L in $GP_LANGUAGES; do + ( + mkdir -p exp/$L/tri3b_ali + steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \ + data/$L/train data/$L/lang exp/$L/tri3b exp/$L/tri3b_ali + + num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) + num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) + mkdir -p exp/$L/ubm4a + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/$L/train data/$L/lang exp/$L/tri3b_ali exp/$L/ubm4a + + mkdir -p exp/$L/sgmm2_4a + steps/train_sgmm2.sh --cmd "$train_cmd" \ + $num_states $num_substates data/$L/train data/$L/lang exp/$L/tri3b_ali \ + exp/$L/ubm4a/final.ubm exp/$L/sgmm2_4a + ) & +done +wait; + +## Decode sgmm2_4a +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/sgmm2_4a/graph_${lm_suffix} + mkdir -p $graph_dir + utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/sgmm2_4a \ + $graph_dir + + steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b/decode_dev_${lm_suffix} $graph_dir data/$L/dev \ + exp/$L/sgmm2_4a/decode_dev_${lm_suffix} + steps/decode_sgmm2.sh --use-fmllr true --nj 5 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b/decode_eval_${lm_suffix} $graph_dir data/$L/eval \ + exp/$L/sgmm2_4a/decode_eval_${lm_suffix} + ) + done +done +wait; + + +# Now we'll align the SGMM system to prepare for discriminative training MMI +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + mkdir -p exp/$L/sgmm2_4a_ali + steps/align_sgmm2.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir exp/$L/tri3b_ali --use-graphs true --use-gselect true data/$L/train \ + data/$L/lang exp/$L/sgmm2_4a exp/$L/sgmm2_4a_ali + + mkdir -p exp/$L/sgmm2_4a_denlats + steps/make_denlats_sgmm2.sh --nj 10 --sub-split 10 --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b_ali data/$L/train data/$L/lang \ + exp/$L/sgmm2_4a_ali exp/$L/sgmm2_4a_denlats + mkdir -p exp/$L/sgmm2_4a_mmi_b0.1 + steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" \ + --transform-dir exp/$L/tri3b_ali --boost 0.1 data/$L/train data/$L/lang \ + exp/$L/sgmm2_4a_ali exp/$L/sgmm2_4a_denlats exp/$L/sgmm2_4a_mmi_b0.1 + ) & + done +done +wait; + +# decode sgmm2_4a-mmi_b0.1 +for L in $GP_LANGUAGES; do + for lm_suffix in tgpr_sri; do + ( + graph_dir=exp/$L/sgmm2_4a/graph_${lm_suffix} + for iter in 1 2 3 4; do + for test in dev eval; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" \ + --iter $iter --transform-dir 
exp/$L/tri3b/decode_${test}_${lm_suffix} data/$L/lang_test_${lm_suffix} \ + data/$L/${test} exp/$L/sgmm2_4a/decode_${test}_${lm_suffix} \ + exp/$L/sgmm2_4a_mmi_b0.1/decode_${test}_${lm_suffix}_it$iter + done + done + ) & + done +done +wait; + + # SGMMs starting from non-SAT triphone system, both with and without # speaker vectors. for L in $GP_LANGUAGES; do ( - mkdir -p exp/$L/tri1_ali - steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/$L/train data/$L/lang exp/$L/tri1 exp/$L/tri1_ali \ - >& exp/$L/tri1_ali/align.log - mkdir -p exp/$L/ubm2a - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \ - >& exp/$L/ubm2a/train.log || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" \ + 400 data/$L/train data/$L/lang exp/$L/tri1_ali exp/$L/ubm2a \ + >& exp/$L/ubm2a/train.log num_states=$(grep "^$L" conf/sgmm.conf | cut -f2) num_substates=$(grep "^$L" conf/sgmm.conf | cut -f3) @@ -222,35 +377,5 @@ for L in $GP_LANGUAGES; do done # loop over model with and without speaker vecs done # loop over languages -# Train SGMMs using SAT features -for L in $GP_LANGUAGES; do - ( - mkdir -p exp/$L/ubm2c - steps/train_ubm.sh --cmd "$train_cmd" \ - 400 data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr exp/$L/ubm2c \ - >& exp/$L/ubm2c/train.log || exit 1; - num_states=$(grep "^$L" conf/tri.conf | cut -f2) - num_gauss=$(grep "^$L" conf/tri.conf | cut -f3) - mkdir -p exp/$L/sgmm2c - steps/train_sgmm.sh --cmd "$train_cmd" --cluster-thresh 100 \ - $num_states $num_gauss data/$L/train data/$L/lang exp/$L/tri1_ali_fmllr \ - exp/$L/ubm2c/final.ubm exp/$L/sgmm2c >& exp/$L/sgmm2c/train.log - ) & -done -wait - -for L in $GP_LANGUAGES; do - for lm_suffix in tgpr_sri; do - ( - graph_dir=exp/$L/sgmm2c/graph_${lm_suffix} - mkdir -p $graph_dir - $highmem_cmd $graph_dir/mkgraph.log \ - utils/mkgraph.sh data/$L/lang_test_${lm_suffix} exp/$L/sgmm2c $graph_dir - steps/decode_sgmm.sh --nj 5 --cmd "$decode_cmd" \ - --transform-dir exp/$L/tri2a/decode_dev_${lm_suffix} \ - $graph_dir data/$L/dev exp/$L/sgmm2c/decode_dev_${lm_suffix} - ) & - done -done diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS index 851e2a1ced1..33d9f713e75 100644 --- a/egs/hkust/s5/RESULTS +++ b/egs/hkust/s5/RESULTS @@ -20,4 +20,8 @@ exp/lstm5e/decode/cer_10:%WER 37.61 [ 21121 / 56154, 1829 ins, 3941 del, 15351 s # nnet2 results exp/nnet2_5d/decode/cer_10:%WER 38.59 [ 21669 / 56154, 2498 ins, 3581 del, 15590 sub ] # ConvNet with 2 convolutional layers and 2 ReLU layers -exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ] +exp/nnet2_convnet/decode/cer_10:%WER 41.19 [ 23129 / 56154, 2599 ins, 3782 del, 16748 sub ] + +# nnet3 results (using speed perturbed data) +exp/nnet3/tdnn_sp/decode_dev/cer_10:%WER 33.79 [ 18977 / 56154, 2027 ins, 3485 del, 13465 sub ] +exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ] \ No newline at end of file diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/hkust/s5/local/create_oov_char_lexicon.pl b/egs/hkust/s5/local/create_oov_char_lexicon.pl new file mode 100644 index 00000000000..aaf5d3bcb9b --- /dev/null +++ b/egs/hkust/s5/local/create_oov_char_lexicon.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl +# Copyright 2016 LeSpeech (Author: Xingyu Na) +# +# A script for char-based Chinese OOV lexicon generation. +# +# Input 1: char-based dictionary, example +# CHAR1 ph1 ph2 +# CHAR2 ph3 +# CHAR3 ph2 ph4 +# +# Input 2: OOV word list, example +# WORD1 +# WORD2 +# WORD3 +# +# where WORD1 is in the format of "CHAR1CHAR2". +# +# Output: OOV lexicon, in the format of normal lexicon + +if($#ARGV != 1) { + print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; + print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; + print STDERR "### oovwordlist: OOV word list\n"; + print STDERR "### oovlex: output OOV lexicon\n"; + exit; +} + +use encoding utf8; +my %prons; +open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +foreach () { + chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; +} +close DICT; + +open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +while () { + chomp; + print $_; + @A = split("", $_); + foreach (@A) { + print " $prons{$_}"; + } + print "\n"; +} +close WORDS; diff --git a/egs/hkust/s5/local/hkust_format_data.sh b/egs/hkust/s5/local/hkust_format_data.sh index 4f517e6dd1a..33cf8fa22ef 100755 --- a/egs/hkust/s5/local/hkust_format_data.sh +++ b/egs/hkust/s5/local/hkust_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -23,26 +23,13 @@ done rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -71,4 +58,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo hkust_format_data succeeded. - diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index cd3ed602c70..dc9e5262dfb 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2016 LeSpeech (Author: Xingyu Na) + # prepare dictionary for HKUST # it is done for English and Chinese separately, # For English, we use CMU dictionary, and Sequitur G2P @@ -14,23 +16,19 @@ train_dir=data/local/train dev_dir=data/local/dev dict_dir=data/local/dict mkdir -p $dict_dir - -case 0 in #goto here - 1) -;; #here: -esac - - +mkdir -p $dict_dir/lexicon-{en,ch} + # extract full vocabulary cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ - grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt # split into English and Chinese -cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt -cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt +cat $dict_dir/words.txt | grep '[a-zA-Z]' > $dict_dir/lexicon-en/words-en.txt +cat $dict_dir/words.txt | grep -v '[a-zA-Z]' > $dict_dir/lexicon-ch/words-ch.txt + -# produce pronunciations for english +##### produce pronunciations for english if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ @@ -40,19 +38,19 @@ fi echo "--- Striping stress and pronunciation variant markers from cmudict ..." perl $dict_dir/cmudict/scripts/make_baseform.pl \ $dict_dir/cmudict/cmudict.0.7a /dev/stdout |\ - sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict-plain.txt + sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict/cmudict-plain.txt echo "--- Searching for English OOV words ..." 
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ - $dict_dir/cmudict-plain.txt $dict_dir/vocab-en.txt |\ - egrep -v '<.?s>' > $dict_dir/vocab-en-oov.txt + $dict_dir/cmudict/cmudict-plain.txt $dict_dir/lexicon-en/words-en.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/words-en-oov.txt gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/vocab-en.txt $dict_dir/cmudict-plain.txt |\ - egrep -v '<.?s>' > $dict_dir/lexicon-en-iv.txt + $dict_dir/lexicon-en/words-en.txt $dict_dir/cmudict/cmudict-plain.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/lexicon-en-iv.txt -wc -l $dict_dir/vocab-en-oov.txt -wc -l $dict_dir/lexicon-en-iv.txt +wc -l $dict_dir/lexicon-en/words-en-oov.txt +wc -l $dict_dir/lexicon-en/lexicon-en-iv.txt pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages @@ -83,21 +81,78 @@ fi echo "--- Preparing pronunciations for OOV words ..." python tools/g2p/lib/python${pyver}/site-packages/g2p.py \ - --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt + --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt > $dict_dir/lexicon-en/lexicon-en-oov.txt -cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ - sort > $dict_dir/lexicon-en-phn.txt +cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\ + sort > $dict_dir/lexicon-en/lexicon-en-phn.txt +mkdir $dict_dir/map +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/map/cmu +cat conf/pinyin2cmu | awk -v cmu=$dict_dir/map/cmu \ + 'BEGIN{while((getline $dict_dir/map/cmu-used +cat $dict_dir/map/cmu | awk -v cmu=$dict_dir/map/cmu-used \ + 'BEGIN{while((getline $dict_dir/map/cmu-not-used + +gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/map/cmu-not-used conf/cmu2pinyin |\ + egrep -v '<.?s>' > $dict_dir/map/cmu-py + +cat $dict_dir/map/cmu-py | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } + print "@entry"; + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/map/cmu-cmu +cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { + if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } + else {push(@entry, $A[$i])}; + } + print "@entry"; + print "\n"; + } +' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt -# produce pronunciations for chinese -if [ ! -f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then - wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz - gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz +##### produce pronunciations for chinese +if [ ! 
-f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then + mkdir -p $dict_dir/cedict + wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz fi -cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ +cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ perl -e ' while () { @A = split(" ", $_); @@ -109,27 +164,24 @@ cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1 } print "\n"; } - ' | sort -k1 > $dict_dir/ch-dict.txt + ' | sort -k1 > $dict_dir/cedict/ch-dict.txt echo "--- Searching for Chinese OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ - $dict_dir/ch-dict.txt $dict_dir/vocab-ch.txt |\ - egrep -v '<.?s>' > $dict_dir/vocab-ch-oov.txt + $dict_dir/cedict/ch-dict.txt $dict_dir/lexicon-ch/words-ch.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/words-ch-oov.txt gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\ - egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt - -wc -l $dict_dir/vocab-ch-oov.txt -wc -l $dict_dir/lexicon-ch-iv.txt + $dict_dir/lexicon-ch/words-ch.txt $dict_dir/cedict/ch-dict.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/lexicon-ch-iv.txt +wc -l $dict_dir/lexicon-ch/words-ch-oov.txt +wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt -# this -unset LC_ALL -# first make sure number of characters and pinyins -# are equal -cat $dict_dir/ch-dict.txt |\ +# validate Chinese dictionary and compose a char-based +# dictionary in order to get OOV pronunciations +cat $dict_dir/cedict/ch-dict.txt |\ perl -e ' use encoding utf8; while () { @@ -138,15 +190,38 @@ cat $dict_dir/ch-dict.txt |\ $proun_len = @A - 1 ; if ($word_len == $proun_len) {print $_;} } - ' > $dict_dir/ch-dict-1.txt + ' > $dict_dir/cedict/ch-dict-1.txt -cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep -v '^$' > $dict_dir/ch-char.txt -cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt -wc -l $dict_dir/ch-char.txt -wc -l $dict_dir/ch-char-pinyin.txt -paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt +# extract chars +cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ + perl -e ' + use encoding utf8; + while () { + @A = split(" ", $_); + @chars = split("", $A[0]); + foreach (@chars) { + print "$_\n"; + } + } + ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt -cat $dict_dir/ch-char-dict.txt |\ +# extract individual pinyins +cat $dict_dir/cedict/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/lexicon-ch/ch-char-pinyin.txt + +# first make sure number of characters and pinyins +# are equal, so that a char-based dictionary can +# be composed. +nchars=`wc -l < $dict_dir/lexicon-ch/ch-char.txt` +npinyin=`wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt` +if [ $nchars -ne $npinyin ]; then + echo "Found $nchars chars and $npinyin pinyin. Please check!" 
+ exit 1 +fi + +paste $dict_dir/lexicon-ch/ch-char.txt $dict_dir/lexicon-ch/ch-char-pinyin.txt | sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt + +# create a multiple pronunciation dictionary +cat $dict_dir/lexicon-ch/ch-char-dict.txt |\ perl -e ' my $prev = ""; my $out_line = ""; @@ -161,14 +236,13 @@ cat $dict_dir/ch-char-dict.txt |\ $prev = $cur; } print $out_line; - ' > $dict_dir/ch-char-dict-1.txt - -cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \ - 'BEGIN{while((getline0) dict[$1]=$2;} - {printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \ - > $dict_dir/lexicon-ch-oov.txt + ' > $dict_dir/lexicon-ch/ch-char-dict-mp.txt +2 +# get lexicon for Chinese OOV words +perl local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt -cat $dict_dir/lexicon-ch-oov.txt |\ +# seperate multiple prons for Chinese OOV lexicon +cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\ perl -e ' my @entry; my @entry1; @@ -192,72 +266,18 @@ cat $dict_dir/lexicon-ch-oov.txt |\ print "\n"; } } - ' > $dict_dir/lexicon-ch-oov1.txt + ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt -cat $dict_dir/lexicon-ch-oov1.txt $dict_dir/lexicon-ch-iv.txt |\ - awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt +# compose IV and OOV lexicons for Chinese +cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\ + awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch/lexicon-ch.txt -cat $dict_dir/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ - utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch-cmu.txt +# convert Chinese pinyin to CMU format +cat $dict_dir/lexicon-ch/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ + utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt -cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu -cat conf/pinyin2cmu | awk -v cmu=$dict_dir/cmu \ - 'BEGIN{while((getline $dict_dir/cmu-used -cat $dict_dir/cmu | awk -v cmu=$dict_dir/cmu-used \ - 'BEGIN{while((getline $dict_dir/cmu-not-used - -gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ - $dict_dir/cmu-not-used conf/cmu2pinyin |\ - egrep -v '<.?s>' > $dict_dir/cmu-py - -cat $dict_dir/cmu-py | \ - perl -e ' - open(MAPS, $ARGV[0]) or die("could not open map file"); - my %py2ph; - foreach $line () { - @A = split(" ", $line); - $py = shift(@A); - $py2ph{$py} = [@A]; - } - my @entry; - while () { - @A = split(" ", $_); - @entry = (); - $W = shift(@A); - push(@entry, $W); - for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } - print "@entry"; - print "\n"; - } -' conf/pinyin2cmu > $dict_dir/cmu-cmu - -cat $dict_dir/lexicon-en-phn.txt | \ - perl -e ' - open(MAPS, $ARGV[0]) or die("could not open map file"); - my %py2ph; - foreach $line () { - @A = split(" ", $line); - $py = shift(@A); - $py2ph{$py} = [@A]; - } - my @entry; - while () { - @A = split(" ", $_); - @entry = (); - $W = shift(@A); - push(@entry, $W); - for($i = 0; $i < @A; $i++) { - if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } - else {push(@entry, $A[$i])}; - } - print "@entry"; - print "\n"; - } -' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt - -cat $dict_dir/lexicon-en.txt $dict_dir/lexicon-ch-cmu.txt |\ +# combine English and Chinese lexicons +cat $dict_dir/lexicon-en/lexicon-en.txt $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\ sort -u > $dict_dir/lexicon1.txt cat 
$dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ @@ -284,21 +304,14 @@ echo SIL > $dict_dir/optional_silence.txt # No "extra questions" in the input to this setup, as we don't # have stress or tone -#echo -n > $dict_dir/extra_questions.txt cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ >> $dict_dir/extra_questions.txt || exit 1; - # Add to the lexicon the silences, noises etc. (echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU'; echo ' SPN' ) | \ cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; - -export LC_ALL=C - - - exit 1; diff --git a/egs/hkust/s5/local/nnet/run_cnn.sh b/egs/hkust/s5/local/nnet/run_cnn.sh index 17fbc2d7c17..e0b7e10df86 100755 --- a/egs/hkust/s5/local/nnet/run_cnn.sh +++ b/egs/hkust/s5/local/nnet/run_cnn.sh @@ -82,7 +82,7 @@ if [ $stage -le 4 ]; then cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - nnet-concat "nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" $dbn $cnn_dbn \ + nnet-concat "nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" $dbn $cnn_dbn \ 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train diff --git a/egs/hkust/s5/local/nnet/run_lstm.sh b/egs/hkust/s5/local/nnet/run_lstm.sh index 38c4474ac07..ec5d0e3a856 100755 --- a/egs/hkust/s5/local/nnet/run_lstm.sh +++ b/egs/hkust/s5/local/nnet/run_lstm.sh @@ -46,7 +46,7 @@ if [ $stage -le 1 ]; then $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --network-type lstm --learn-rate 0.0001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ - --train-opts "--momentum 0.9 --halving-factor 0.5" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.5" \ --delta-opts "--delta-order=2" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ --proto-opts "--num-cells 2000 --num-recurrent 750 --num-layers 1 --clip-gradient 5.0" \ diff --git a/egs/hkust/s5/local/nnet2/run_convnet.sh b/egs/hkust/s5/local/nnet2/run_convnet.sh index f5baab0dc5d..56b81c42a11 100755 --- a/egs/hkust/s5/local/nnet2/run_convnet.sh +++ b/egs/hkust/s5/local/nnet2/run_convnet.sh @@ -49,7 +49,7 @@ fi --num-threads 1 --minibatch-size 512 \ --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 15 --delta-order 2 \ - --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \ + --initial-effective-lrate 0.0001 --final-effective-lrate 0.00001 \ --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \ --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \ --num-filters2 256 --patch-dim2 4 \ diff --git a/egs/hkust/s5/local/nnet3/run_ivector_common.sh b/egs/hkust/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..046f723ca1e --- /dev/null +++ b/egs/hkust/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. + +. cmd.sh + + +stage=0 +num_threads_ubm=32 +speed_perturb=true +use_sat_alignments=true + +set -e +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if [ "$use_sat_alignments" == "true" ] ; then + gmm_dir=exp/tri5a + align_script=steps/align_fmllr.sh +else + gmm_dir=exp/tri3a + align_script=steps/align_si.sh +fi + +if [ $stage -le 1 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in train dev; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + if [ "$datadir" == "train" ]; then + dir=data/train_hires + cat $dir/wav.scp | python -c " +import sys, os, subprocess, re, random +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1; + mv $dir/wav.scp $dir/wav.scp_nonorm + mv $dir/wav.scp_scaled $dir/wav.scp + fi + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 2 ]; then + # Train a system just for its LDA+MLLT transform. We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. 
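# This LDA+MLLT system is not used for decoding; it only supplies the feature
# transform on which the diagonal UBM (stage 3) and the i-vector extractor
# (stage 4) below are trained.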
+ steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/train_hires data/lang \ + ${gmm_dir}_ali exp/nnet3/tri5 +fi + +if [ $stage -le 3 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ + --num-frames 700000 \ + --num-threads $num_threads_ubm \ + data/train_hires 512 exp/nnet3/tri5 exp/nnet3/diag_ubm +fi + +if [ $stage -le 4 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 5 ] && [ "$speed_perturb" == "true" ]; then + # Although the nnet will be trained by high resolution data, + # we still have to perturbe the normal data to get the alignment + # _sp stands for speed-perturbed + utils/perturb_data_dir_speed.sh 0.9 data/train data/temp1 + utils/perturb_data_dir_speed.sh 1.0 data/train data/temp2 + utils/perturb_data_dir_speed.sh 1.1 data/train data/temp3 + utils/combine_data.sh --extra-files utt2uniq data/train_sp data/temp1 data/temp2 data/temp3 + rm -r data/temp1 data/temp2 data/temp3 + + mfccdir=mfcc_perturbed + for x in train_sp; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 70 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + done + utils/fix_data_dir.sh data/train_sp + + $align_script --nj 30 --cmd "$train_cmd" \ + data/train_sp data/lang $gmm_dir ${gmm_dir}_sp_ali || exit 1 + + # Now perturb the high resolution daa + utils/copy_data_dir.sh data/train_sp data/train_sp_hires + mfccdir=mfcc_perturbed_hires + for x in train_sp_hires; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 70 --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_hires/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_hires/$x $mfccdir || exit 1; + done + utils/fix_data_dir.sh data/train_sp_hires +fi + +if [ "$speed_perturb" == "true" ]; then + train_set=train_sp +else + train_set=train +fi + +if [ $stage -le 6 ]; then + rm -f exp/nnet3/.error 2>/dev/null + ivectordir=exp/nnet3/ivectors_${train_set}_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + fi + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_hires_max2 \ + exp/nnet3/extractor \ + exp/nnet3/ivectors_${train_set}_hires \ + || touch exp/nnet3/.error + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." 
&& exit 1; +fi + +if [ $stage -le 7 ]; then + rm -f exp/nnet3/.error 2>/dev/null + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/dev_hires exp/nnet3/extractor exp/nnet3/ivectors_dev || touch exp/nnet3/.error & + wait + [ -f exp/nnet3/.error ] && echo "$0: error extracting iVectors." && exit 1; +fi + +exit 0; diff --git a/egs/hkust/s5/local/nnet3/run_lstm.sh b/egs/hkust/s5/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..061040e55c1 --- /dev/null +++ b/egs/hkust/s5/local/nnet3/run_lstm.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# this is a basic lstm script + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. +set -e + +stage=0 +train_stage=-10 +use_sat_alignments=true +affix= +speed_perturb=true + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +clipping_threshold=10.0 +norm_based_clipping=true +common_egs_dir= + +# natural gradient options +ng_per_element_scale_options= +ng_affine_options= +num_epochs=4 + +# training options +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +num_jobs_initial=2 +num_jobs_final=12 +shrink=0.98 +momentum=0.5 +adaptive_shrink=true +num_chunk_per_minibatch=100 +num_bptt_steps=20 +samples_per_iter=20000 +remove_egs=true + +# feature options +use_ivectors=true + +#decode options +extra_left_context= +frames_per_chunk= + +# End configuration section. + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$dir/scoring/text.filt -unset LC_ALL #for character error rate cat $dir/scoring/text.filt | awk '{ print $1}' > $dir/scoring/utt_id -cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra +cat $dir/scoring/text.filt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' |\ + perl -e ' + use encoding utf8; + while () { + @words = split(" ", $_); + foreach (@words) { + @chars = split("", $_); + foreach (@chars) { + print "$_ "; + } + } + print "\n"; + } + ' > $dir/scoring/utt_tra paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/char.filt for lmwt in `seq $min_lmwt $max_lmwt`; do cat $dir/scoring/$lmwt.txt | awk '{ print $1}' > $dir/scoring/utt_id - cat $dir/scoring/$lmwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' | sed -e 's/\(\S\)/\1 /g' > $dir/scoring/utt_tra + cat $dir/scoring/$lmwt.txt | awk '{{for (i = 2; i <= NF; i++) printf(" %s", $i);} printf("\n"); }' |\ + perl -e ' + use encoding utf8; + while () { + @words = split(" ", $_); + foreach (@words) { + @chars = split("", $_); + foreach (@chars) { + print "$_ "; + } + } + print "\n"; + } + ' > $dir/scoring/utt_tra paste $dir/scoring/utt_id $dir/scoring/utt_tra > $dir/scoring/${lmwt}.char done rm $dir/scoring/utt_tra $dir/scoring/utt_id -export LC_ALL=C - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ compute-wer --text --mode=present \ ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1; diff --git a/egs/hkust/s5/path.sh b/egs/hkust/s5/path.sh index e49bed09e8f..5adfbeec7c2 100755 --- a/egs/hkust/s5/path.sh +++ b/egs/hkust/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. #export KALDI_ROOT=/home/dpovey/kaldi-trunk-test -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/iban/README b/egs/iban/README new file mode 100644 index 00000000000..4385ed37201 --- /dev/null +++ b/egs/iban/README @@ -0,0 +1,84 @@ +### +# Iban Data collected by Sarah Samson Juan and Laurent Besacier +# Prepared by Sarah Samson Juan and Laurent Besacier +# Created in GETALP, Grenoble, France +### + + +## INTRODUCTION ## +This package has iban text and speech corpora used for Automatic Speech Recognition (ASR) experiments. Data is available in the subdirectories of /data. The subdirectories contain: +a. train - train transcript for training ASR system using Kaldi ASR (http://kaldi.sourceforge.net/) +b. test - test transcript for testing ASR system (also Kaldi ASR format) +c. wav - speech corpus + +We have provided text corpus and language model in the /LM directory, while, the pronunciation dictionary in /lang directory. + +###PUBLICATION ON IBAN DATA AND ASR ##### +Details on the corpora and the our experiments on iban ASR can be found in the following list of publication. We appreciate if you cite them if you intend to publish. 
+
+@inproceedings{Juan14,
+ Author = {Sarah Samson Juan and Laurent Besacier and Solange Rossato},
+ Booktitle = {Proceedings of Workshop for Spoken Language Technology for Under-resourced (SLTU)},
+ Month = {May},
+ Title = {Semi-supervised G2P bootstrapping and its application to ASR for a very under-resourced language: Iban},
+ Year = {2014}}
+
+
+@inproceedings{Juan2015,
+ Title = {Using Resources from a closely-Related language to develop ASR for a very under-resourced Language: A case study for Iban},
+ Author = {Sarah Samson Juan and Laurent Besacier and Benjamin Lecouteux and Mohamed Dyab},
+ Booktitle = {Proceedings of INTERSPEECH},
+ Year = {2015},
+ Address = {Dresden, Germany},
+ Month = {September}}
+
+
+###IBAN SPEECH CORPUS
+News data provided by a local radio station in Sarawak, Malaysia.
+
+Directory: data/train
+Files: text (training transcript), wav.scp (file id and path to audio file), utt2spk (file id and speaker id), spk2utt (speaker id and file ids), wav (.wav files).
+For more information about the format, please refer to the Kaldi website http://kaldi.sourceforge.net/data_prep.html (an illustrative example is also given at the end of this README).
+Description: training data in Kaldi format, about 7 hours. Note: The paths of the wav files in wav.scp MUST BE MODIFIED to point to the actual location.
+
+Directory: data/test
+Files: text (test transcript), wav.scp (file id and path to audio file), utt2spk (file id and speaker id), spk2utt (speaker id and file ids), wav (.wav files).
+Description: test data in Kaldi format, about 1 hour. Note: The paths of the wav files in wav.scp MUST BE MODIFIED to point to the actual location.
+
+The audio files are named as follows:
+ib[m|f]_SPK_UTT, where m refers to a male and f to a female speaker, SPK denotes the speaker id and UTT is the utterance id.
+
+#### IBAN TEXT CORPUS
+Directory: /LM/
+Files: iban-bp-2012.txt, iban-lm-o3.arpa
+
+# /iban-bp-2012.txt
+Contains 2M words of text data crawled from an online newspaper and cleaned as much as we could.
+
+# /iban-lm-o3.arpa
+The language model built with SRILM (http://www.speech.sri.com/projects/srilm/) from iban-bp-2012.txt.
+
+
+#### LEXICON/PRONUNCIATION DICTIONARY
+Directory: /lang
+Files : lexicon.txt (lexicon), nonsilence_phones.txt (speech phones), optional_silence.txt (silence phone)
+Description: the lexicon contains words and their respective pronunciations, plus non-speech sounds and noise, in Kaldi format. Details on the development of the dictionary can be found in our papers. (For this package, we provide the Iban-Hybrid version.)
+
+
+#TO DOWNLOAD THE REPOSITORY
+
+svn co https://github.com/sarahjuan/iban
+
+### SCRIPTS
+In /kaldi-scripts, you can find all the scripts that can be used to train and test models from the existing data and lang directories. Note: Paths need to be changed to make them work in your own directory.
+
+You can launch run.sh to prepare the data & language model, compute MFCCs and train acoustic models.
+
+
+### WER RESULTS OBTAINED USING OUR CORPORA AND SETTINGS. RESULTS OBTAINED AFTER UPDATING THE TEST TRANSCRIPT. THE ONES REPORTED IN OUR PAPERS WERE BEFORE THIS UPDATE ##
+
+See the latest results in the s5/RESULTS file (they will not match the results from the papers).
+
+##ACKNOWLEDGEMENT ###
+We would like to thank the Ministry of Higher Education Malaysia for providing financial support to conduct this study. We also thank The Borneo Post news agency for providing online materials for building the text corpus, and Radio Televisyen Malaysia (RTM), Sarawak, Malaysia, for providing the news data.
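+
+### EXAMPLE OF THE KALDI DATA FILES
+The entries below are purely illustrative (the utterance id, speaker id and path are made up); they only show how the files described above fit together. A training utterance ibf_002_017 spoken by speaker ibf_002 would appear as:
+  text:     ibf_002_017 ...transcription words...
+  wav.scp:  ibf_002_017 /path/to/iban/data/wav/ibf_002/ibf_002_017.wav
+  utt2spk:  ibf_002_017 ibf_002
+  spk2utt:  ibf_002 ibf_002_017 ibf_002_018 ...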
+ diff --git a/egs/iban/s5/RESULTS b/egs/iban/s5/RESULTS new file mode 100644 index 00000000000..09077fdcba8 --- /dev/null +++ b/egs/iban/s5/RESULTS @@ -0,0 +1,16 @@ +%WER 15.32 [ 1686 / 11006, 220 ins, 338 del, 1128 sub ] exp/sgmm2_5b2/decode_dev.big/wer_18_0.0 +%WER 15.36 [ 1691 / 11006, 214 ins, 322 del, 1155 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev.big/wer_18_0.0 +%WER 15.50 [ 1706 / 11006, 212 ins, 327 del, 1167 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev.rescored/wer_18_0.0 +%WER 15.84 [ 1743 / 11006, 242 ins, 332 del, 1169 sub ] exp/sgmm2_5b2/decode_dev.rescored/wer_15_0.0 +%WER 17.45 [ 1921 / 11006, 252 ins, 326 del, 1343 sub ] exp/nnet3/nnet_tdnn_h_sp_4_850_170/decode_dev/wer_15_0.0 +%WER 17.55 [ 1932 / 11006, 266 ins, 323 del, 1343 sub ] exp/sgmm2_5b2/decode_dev/wer_13_0.0 +%WER 19.08 [ 2100 / 11006, 245 ins, 503 del, 1352 sub ] exp/tri3b/decode_dev.rescored/wer_20_0.0 +%WER 20.92 [ 2302 / 11006, 263 ins, 518 del, 1521 sub ] exp/tri3b/decode_dev/wer_19_0.0 +%WER 24.19 [ 2662 / 11006, 243 ins, 900 del, 1519 sub ] exp/tri2b/decode_dev.rescored/wer_14_0.0 +%WER 25.26 [ 2780 / 11006, 294 ins, 736 del, 1750 sub ] exp/tri3b/decode_dev.si/wer_16_0.0 +%WER 26.44 [ 2910 / 11006, 292 ins, 832 del, 1786 sub ] exp/tri2b/decode_dev/wer_13_0.0 +%WER 30.99 [ 3411 / 11006, 245 ins, 1391 del, 1775 sub ] exp/tri1/decode_dev.rescored/wer_12_0.0 +%WER 33.31 [ 3666 / 11006, 260 ins, 1428 del, 1978 sub ] exp/tri1/decode_dev/wer_12_0.0 +%WER 33.81 [ 3721 / 11006, 241 ins, 1585 del, 1895 sub ] exp/tri2a/decode_dev.rescored/wer_11_0.0 +%WER 35.69 [ 3928 / 11006, 243 ins, 1750 del, 1935 sub ] exp/tri2a/decode_dev/wer_12_0.0 +%WER 39.41 [ 4338 / 11006, 190 ins, 1237 del, 2911 sub ] exp/mono/decode_dev/wer_11_0.0 diff --git a/egs/iban/s5/cmd.sh b/egs/iban/s5/cmd.sh new file mode 100755 index 00000000000..487a2244c04 --- /dev/null +++ b/egs/iban/s5/cmd.sh @@ -0,0 +1,5 @@ +export train_cmd="run.pl --max-jobs-run 32" +export decode_cmd="run.pl --max-jobs-run 32" + +#export train_cmd="queue.pl" +#export decode_cmd="queue.pl --mem 4G" diff --git a/egs/iban/s5/conf/decode.config b/egs/iban/s5/conf/decode.config new file mode 100644 index 00000000000..10b0eee900b --- /dev/null +++ b/egs/iban/s5/conf/decode.config @@ -0,0 +1,4 @@ +# Use wider-than-normal decoding beams for RM. +first_beam=16.0 +beam=20.0 +lattice_beam=10.0 diff --git a/egs/iban/s5/conf/decode_dnn.config b/egs/iban/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/iban/s5/conf/mfcc.conf b/egs/iban/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/iban/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/iban/s5/conf/mfcc_hires.conf b/egs/iban/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/iban/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/iban/s5/conf/online_cmvn.conf b/egs/iban/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/iban/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/iban/s5/local/arpa2G.sh b/egs/iban/s5/local/arpa2G.sh
new file mode 100755
index 00000000000..dddd7eb9097
--- /dev/null
+++ b/egs/iban/s5/local/arpa2G.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# Copyright 2013-2014 Johns Hopkins University (authors: Yenda Trmal, Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+#Simple utility script to convert the gzipped ARPA lm into a G.fst file
+
+
+oov_prob_file=
+unk_fraction=
+cleanup=true
+#end configuration section.
+
+
+
+echo $0 $@
+
+[ -f ./path.sh ] && . ./path.sh
+[ -f ./cmd.sh ] && . ./cmd.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [options] <arpa-lm-file> <lang-dir> <dest-dir>"
+  echo "Options: --oov-prob-file <file>  # e.g. data/local/oov2prob"
+  echo "         # with this option it will replace <unk> with OOVs in G.fst."
+  exit 1;
+fi
+
+set -e          #Exit on non-zero return code from any command
+set -o pipefail #Exit if any of the commands in the pipeline will
+                #return non-zero return code
+
+lmfile=$1
+langdir=$2
+destdir=$3
+
+mkdir $destdir 2>/dev/null || true
+
+
+if [ ! -z "$oov_prob_file" ]; then
+  if [ ! -s "$oov_prob_file" ]; then
+    echo "$0: oov-prob file $oov_prob_file is empty or does not exist"
+    exit 1;
+  fi
+  if [ -z "$unk_fraction" ]; then
+    echo "--oov-prob option requires --unk-fraction option";
+    exit 1;
+  fi
+
+  min_prob=$(gunzip -c $lmfile | perl -e ' $minlogprob = 0.0;
+    while(<STDIN>) { if (m/\\(\d)-grams:/) { $order = $1; }
+      if ($order == 1) { @A = split;
+        if ($A[0] < $minlogprob && $A[0] != -99) { $minlogprob = $A[0]; }}} print $minlogprob')
+  echo "Minimum prob in LM file is $min_prob"
+
+  echo "$0: creating LM file with unk words, using $oov_prob_file, in $destdir/lm_tmp.gz"
+  gunzip -c $lmfile | \
+    perl -e ' ($oov_prob_file,$min_prob,$unk_fraction) = @ARGV; $ceilinged=0;
+      $min_prob < 0.0 || die "Bad min_prob";         # this is a log-prob
+      $unk_fraction > 0.0 || die "Bad unk_fraction"; # this is a prob
+      open(F, "<$oov_prob_file") || die "opening oov file";
+      while (<F>) { push @OOVS, $_; }
+      $num_oovs = @OOVS;
+      while(<STDIN>) {
+        if (m/^ngram 1=(\d+)/) { $n = $1 + $num_oovs; print "ngram 1=$n\n"; }
+        else { print; } # print all lines unchanged except the one that says ngram 1=X.
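+        # When the \1-grams: section header is reached, insert one unigram per
+        # OOV word from the oov2prob file: each probability is scaled by
+        # unk_fraction, converted to log10, and capped at the lowest unigram
+        # log-prob found above, so no OOV becomes more likely than an existing word.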
+        if (m/^\\1-grams:$/) {
+          foreach $l (@OOVS) {
+            @A = split(" ", $l);
+            @A == 2 || die "bad line in oov2prob: $_;";
+            ($word, $prob) = @A;
+            $log10prob = (log($prob * $unk_fraction) / log(10.0));
+            if ($log10prob > $min_prob) { $log10prob = $min_prob; $ceilinged++;}
+            print "$log10prob $word\n";
+          }
+        }} print STDERR "Ceilinged $ceilinged unk-probs\n";' \
+    $oov_prob_file $min_prob $unk_fraction | gzip -c > $destdir/lm_tmp.gz
+  lmfile=$destdir/lm_tmp.gz
+fi
+
+if [[ $lmfile == *.bz2 ]] ; then
+  decompress="bunzip2 -c $lmfile"
+elif [[ $lmfile == *.gz ]] ; then
+  decompress="gunzip -c $lmfile"
+else
+  decompress="cat $lmfile"
+fi
+
+$decompress | \
+  grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
+  arpa2fst - | \
+  fstprint | \
+  utils/eps2disambig.pl | \
+  utils/s2eps.pl | \
+  fstcompile --isymbols=$langdir/words.txt \
+    --osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstrmepsilon | fstarcsort --sort_type=olabel > $destdir/G.fst || exit 1
+fstisstochastic $destdir/G.fst || true;
+
+if $cleanup; then
+  rm $destdir/lm_tmp.gz 2>/dev/null || true;
+fi
+
+exit 0
diff --git a/egs/iban/s5/local/nnet3/run_ivector_common.sh b/egs/iban/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..0aa7a301dfe
--- /dev/null
+++ b/egs/iban/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+## Script was adapted from WSJ (online) and RM (some settings)
+
+. cmd.sh
+mfccdir=mfcc
+
+stage=1
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if [ $stage -le 1 ]; then
+  for datadir in train; do
+    utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1
+    utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2
+    utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2
+    utils/validate_data_dir.sh --no-feats data/${datadir}_tmp
+    rm -r data/temp1 data/temp2
+
+    mfccdir=mfcc_perturbed
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 17 \
+      data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1;
+    utils/fix_data_dir.sh data/${datadir}_tmp
+
+    utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
+    utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0
+    utils/fix_data_dir.sh data/${datadir}_sp
+    rm -r data/temp0 data/${datadir}_tmp
+  done
+fi
+
+mkdir -p exp/nnet3
+
+if [ $stage -le 2 ]; then
+  steps/align_fmllr.sh --nj 16 --cmd "$train_cmd" \
+    data/train_sp data/lang exp/tri3b exp/nnet3/tri3b_ali_sp || exit 1
+fi
+
+mfccdir=mfcc_hires
+if [ $stage -le 3 ]; then
+  utils/copy_data_dir.sh data/train_sp data/train_hires || exit 1
+  steps/make_mfcc.sh --nj 16 --mfcc-config conf/mfcc_hires.conf \
+    --cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
+  steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
+
+  for datadir in dev; do
+    utils/copy_data_dir.sh data/$datadir data/${datadir}_hires || exit 1
+    steps/make_mfcc.sh --nj 6 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+    steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1;
+  done
+fi
+
+if [ $stage -le 4 ]; then
+  # Train a small system just for its LDA+MLLT transform. We use --num-iters 13
+  # because after we get the transform (12th iter is the last), any further
+  # training is pointless.
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --realign-iters "" --splice-opts "--left-context=3 --right-context=3" \
+    5000 10000 data/train_hires data/lang \
+    exp/nnet3/tri3b_ali_sp exp/nnet3/tri5b || exit 1
+fi
+
+if [ $stage -le 5 ]; then
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 16 --num-frames 200000 \
+    data/train_hires 256 exp/nnet3/tri5b exp/nnet3/diag_ubm || exit 1
+fi
+
+if [ $stage -le 6 ]; then
+  # even though $nj is just 10, each job uses multiple processes and threads.
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" \
+    --nj 10 --num-processes 1 --num-threads 2 --ivector-dim 50 \
+    data/train_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires \
+    data/train_hires_max2 || exit 1
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 16 \
+    data/train_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_train || exit 1
+fi
+
+if [ $stage -le 8 ]; then
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 6 \
+    data/dev_hires exp/nnet3/extractor exp/nnet3/ivectors_dev || exit 1
+fi
+
+exit 0;
diff --git a/egs/iban/s5/local/nnet3/run_tdnn.sh b/egs/iban/s5/local/nnet3/run_tdnn.sh
new file mode 100755
index 00000000000..ac0e2efa1d0
--- /dev/null
+++ b/egs/iban/s5/local/nnet3/run_tdnn.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Combined from WSJ + RM
+
+# this is the standard "tdnn" system, built in nnet3; it's what we use to
+# call multi-splice.
+
+. ./cmd.sh
+
+
+# At this script level we don't support not running on GPU, as it would be painfully slow.
+# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
+# --num-threads 16 and --minibatch-size 128.
+
+stage=1
+train_stage=-10
+dir=exp/nnet3/tdnn_1
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <&2 "The script $0 expects one parameter -- the location of the Iban corpus"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+fi
+
+
+echo "Preparing train and test data"
+mkdir -p data data/local data/train data/dev
+
+for x in train dev; do
+  echo "Copy spk2utt, utt2spk, wav.scp, text for $x"
+  cp $corpus/data/$x/text data/$x/text || exit 1;
+  cp $corpus/data/$x/spk2utt data/$x/spk2utt || exit 1;
+  cp $corpus/data/$x/utt2spk data/$x/utt2spk || exit 1;
+
+  # the corpus wav.scp contains physical paths, so we just re-generate
+  # the file again from scratch instead of figuring out how to edit it
+  # (see the worked example at the bottom of this script).
+  for rec in $(awk '{print $1}' $corpus/data/$x/text) ; do
+    spk=${rec%_*}
+    filename=$corpus/data/wav/$spk/${rec}.wav
+    if [ ! -f "$filename" ] ; then
+      echo >&2 "The file $filename could not be found ($rec)"
+      exit 1
+    fi
+    # we might want to store physical paths as a general rule
+    filename=$(readlink -f $filename)
+    echo "$rec $filename"
+  done > data/$x/wav.scp
+
+  # fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
+  # duplicate entries and so on). Also, it regenerates the spk2utt from
+  # utt2spk
+  utils/fix_data_dir.sh data/$x
+done
+
+echo "Copying language model"
+if [ -f $corpus/LM/iban-lm-o3.arpa.tar.gz ] ; then
+  tar zxf $corpus/LM/iban-lm-o3.arpa.tar.gz -C data/local/
+fi
+
+echo "Data preparation completed."
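+
+# Worked example of the utterance-id -> wav-path mapping above (the ids below
+# are illustrative only):
+#   rec=ibf_002_017
+#   spk=${rec%_*}        # strips the trailing "_017", giving spk=ibf_002
+#   filename=$corpus/data/wav/ibf_002/ibf_002_017.wav
+#   => wav.scp line: ibf_002_017 /absolute/path/to/ibf_002_017.wav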
+
diff --git a/egs/iban/s5/local/prepare_dict.sh b/egs/iban/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..ebec12bc171
--- /dev/null
+++ b/egs/iban/s5/local/prepare_dict.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+corpus=$1
+if [ -z "$corpus" ] ; then
+  echo >&2 "The script $0 expects one parameter -- the location of the Iban corpus"
+  exit 1
+fi
+if [ ! -d "$corpus" ] ; then
+  echo >&2 "The directory $corpus does not exist"
+fi
+
+mkdir -p data/lang data/local/dict
+
+
+cp $corpus/lang/dict/lexicon.txt data/local/dict/lexicon.txt
+cat data/local/dict/lexicon.txt | \
+  perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \
+  sort -u | grep -v 'SIL' > data/local/dict/nonsilence_phones.txt
+
+
+touch data/local/dict/extra_questions.txt
+touch data/local/dict/optional_silence.txt
+
+echo "SIL" > data/local/dict/optional_silence.txt
+echo "SIL" > data/local/dict/silence_phones.txt
+echo "<UNK>" > data/local/dict/oov.txt
+
+echo "Dictionary preparation succeeded"
diff --git a/egs/iban/s5/local/prepare_lm.sh b/egs/iban/s5/local/prepare_lm.sh
new file mode 100755
index 00000000000..a19dc18f566
--- /dev/null
+++ b/egs/iban/s5/local/prepare_lm.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2015-2016 Sarah Flora Juan
+# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+set -e -o pipefail
+
+# To create G.fst from the ARPA language model
+. ./path.sh || die "path.sh expected";
+
+local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+
+nl -nrz -w10 corpus/LM/iban-bp-2012.txt | sort -R > data/local/external_text
+local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external
+
+# let's do ngram interpolation of the previous two LMs;
+# lm.gz is always a symlink to the model with the best perplexity, so we use that
+
+mkdir -p data/srilm_interp
+for w in 0.9 0.8 0.7 0.6 0.5; do
+  ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \
+    -lambda $w -write-lm data/srilm_interp/lm.${w}.gz
+  echo -n "data/srilm_interp/lm.${w}.gz "
+  ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s
+done | sort -k15,15g > data/srilm_interp/perplexities.txt
+
+# for basic decoding, let's use only a trigram LM
+[ -d data/lang_test/ ] && rm -rf data/lang_test
+cp -R data/lang data/lang_test
+lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_test data/lang_test
+
+# for decoding with the bigger LM, find which interpolation weight gave the biggest improvement
+[ -d data/lang_big ] && rm -rf data/lang_big
+cp -R data/lang data/lang_big
+lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}')
+local/arpa2G.sh $lm data/lang_big data/lang_big
+
+# for the really big LM, we should only decode using the small LM
+# and rescore using the big (const-arpa) LM
+utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big
+exit 0;
diff --git a/egs/iban/s5/local/score.sh b/egs/iban/s5/local/score.sh
new file mode 120000
index 00000000000..0afefc3158c
--- /dev/null
+++ b/egs/iban/s5/local/score.sh
@@ -0,0 +1 @@
+../steps/score_kaldi.sh
\ No newline at end of file
diff --git a/egs/iban/s5/local/train_lms_srilm.sh b/egs/iban/s5/local/train_lms_srilm.sh
new file mode 100755
index 00000000000..9ed88842650
--- /dev/null
+++ b/egs/iban/s5/local/train_lms_srilm.sh
@@ -0,0 +1,230 @@
+#!/bin/bash
+export LC_ALL=C
+
+words_file=
+train_text=
+dev_text=
+oov_symbol="<UNK>"
+
+echo "$0 $@"
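+# Example invocation (this is how local/prepare_lm.sh in this recipe calls it):
+#   local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm
+# The two positional arguments are the data directory and the target directory
+# where the LMs and the perplexity report will be written.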
+
+[ -f path.sh ] && . ./path.sh
+. ./utils/parse_options.sh || exit 1
+
+echo "-------------------------------------"
+echo "Building an SRILM language model     "
+echo "-------------------------------------"
+
+if [ $# -ne 2 ] ; then
+  echo "Incorrect number of parameters."
+  echo "Script has to be called like this:"
+  echo "  $0 [switches] <datadir> <tgtdir>"
+  echo "For example: "
+  echo "  $0 data data/srilm"
+  echo "The allowed switches are: "
+  echo "  words_file=<words_file>   word list file -- data/lang/words.txt by default"
+  echo "  train_text=<train_text>   data/train/text is used when not specified"
+  echo "  dev_text=<dev_text>       last 10 % of the train text is used by default"
+  echo "  oov_symbol=<oov_symbol>   symbol to use for oov modeling -- <UNK> by default"
+  exit 1
+fi
+
+datadir=$1
+tgtdir=$2
+outlm=lm.gz
+
+
+##End of configuration
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=`pwd`/../../../tools/srilm/bin/i686-m64
+  else
+    sdir=`pwd`/../../../tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir. See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+# Prepare the destination directory
+mkdir -p $tgtdir
+
+for f in $words_file $train_text $dev_text; do
+  [ ! -s $f ] && echo "No such file $f" && exit 1;
+done
+
+[ -z $words_file ] && words_file=$datadir/lang/words.txt
+if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then
+  nr=`cat $train_text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+  orig_train_text=$train_text
+  head -n $nr_train $train_text > $tgtdir/train_text
+  tail -n $nr_dev $train_text > $tgtdir/dev_text
+
+  train_text=$tgtdir/train_text
+  dev_text=$tgtdir/dev_text
+  echo "Using words file: $words_file"
+  echo "Using train text: 9/10 of $orig_train_text"
+  echo "Using dev text : 1/10 of $orig_train_text"
+elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text : $dev_text"
+  train_text=$train_text
+  dev_text=$dev_text
+else
+  train_text=$datadir/train/text
+  dev_text=$datadir/dev2h/text
+  echo "Using words file: $words_file"
+  echo "Using train text: $train_text"
+  echo "Using dev text : $dev_text"
+fi
+
+
+
+# Extract the word list from the training dictionary; exclude special symbols
+sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '<eps>' | grep -v -F "$oov_symbol" > $tgtdir/vocab
+if (($?)); then
+  echo "Failed to create vocab from $words_file"
+  exit 1
+else
+  # wc vocab # doesn't work due to some encoding issues
+  echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt
+if (($?)); then
+  echo "Failed to create $tgtdir/train.txt from $train_text"
+  exit 1
+else
+  echo "Removed first word (uid) from every line of $train_text"
+  # wc text.train train.txt # doesn't work due to some encoding issues
+  echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+  echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+# Kaldi transcript files contain Utterance_ID as the first word; remove it
+cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt
+if (($?)); then
+  echo "Failed to create $tgtdir/dev.txt from $dev_text"
+  exit 1
+else
+  echo "Removed first word (uid) from every line of $dev_text"
+  # wc text.train train.txt # doesn't work due to some encoding issues
+  echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'`
+  echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'`
+fi
+
+echo "-------------------"
+echo "Good-Turing 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Kneser-Ney 2grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+echo "-------------------"
+echo "Good-Turing 3grams"
+echo "-------------------"
+ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text 
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol"
+
+if [ ! -z ${LIBLBFGS} ]; then
+  # please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault
+  # instead of that, we simply output the model in the maxent format and convert it to ARPA using the "ngram" tool
+  echo "-------------------"
+  echo "Maxent 2grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 3grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1
+
+  echo "-------------------"
+  echo "Maxent 4grams"
+  echo "-------------------"
+  sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \
+    ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa | \
+    sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1
+
+fi
+
+
+echo "--------------------"
+echo "Computing perplexity"
+echo "--------------------"
+(
+  for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+  for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done
+) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt
+
+echo "The perplexity report is stored in $tgtdir/perplexities.txt"
+
+# This would link the lowest-perplexity LM as the output LM:
+# ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm
+
+# A slight modification of the previous approach:
+# we look at the two lowest-perplexity LMs and prefer the 3gram LM if it is one of
+# the two, even if the 4gram LM has lower perplexity.
+nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l`
+if [[ $nof_trigram_lm -eq 0 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+elif [[ $nof_trigram_lm -eq 2 ]] ; then
+  lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '`
+else # exactly one 3gram LM
+  lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '`
+fi
+(cd $tgtdir; ln -sf `basename $lmfilename` $outlm )
+
diff --git a/egs/iban/s5/path.sh b/egs/iban/s5/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/iban/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iban/s5/run.sh b/egs/iban/s5/run.sh new file mode 100755 index 00000000000..b184a79e45e --- /dev/null +++ b/egs/iban/s5/run.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# Copyright 2015 Sarah Samson Juan +# Apache 2.0 + +# This script prepares data and train/decode ASR. +# Download the Iban corpus from github. wav files are in data/wav, +# language model in LM/*.arpa.tar.gz and lexicon in lang/dict. + +stage=0 + +# initialization PATH +. ./path.sh || die "path.sh expected"; +# initialization commands +. ./cmd.sh +. ./utils/parse_options.sh + +set -e -o pipefail +corpus=./corpus +# download iban to build ASR +if [ ! -f "$corpus/README" ]; then + #available from github + mkdir -p ./$corpus/ + [ ! -f ./iban.tar.gz ] && wget http://www.openslr.org/resources/24/iban.tar.gz + tar xzf iban.tar.gz -C $corpus +fi + +nj=16 +dev_nj=6 + +if [ $stage -le 1 ]; then + echo "Preparing data and training language models" + local/prepare_data.sh $corpus/ + local/prepare_dict.sh $corpus/ + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + local/prepare_lm.sh +fi + + +if [ $stage -le 2 ]; then + # Feature extraction + for x in train dev; do + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc + done +fi + +if [ $stage -le 3 ]; then + ### Monophone + echo "Starting monophone training." + utils/subset_data_dir.sh data/train 1000 data/train.1k + steps/train_mono.sh --nj $nj --cmd "$train_cmd" data/train.1k data/lang exp/mono + echo "Mono training done." + + ( + echo "Decoding the dev set using monophone models." + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --config conf/decode.config --nj $dev_nj --cmd "$decode_cmd" \ + exp/mono/graph data/dev exp/mono/decode_dev + echo "Monophone decoding done." + ) & +fi + + +if [ $stage -le 4 ]; then + ### Triphone + echo "Starting triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 3200 30000 data/train data/lang exp/mono_ali exp/tri1 + echo "Triphone training done." + + ( + echo "Decoding the dev set using triphone models." + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri1/graph data/dev exp/tri1/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri1/decode_dev exp/tri1/decode_dev.rescored + echo "Triphone decoding done." + ) & +fi + +if [ $stage -le 5 ]; then + ## Triphones + delta delta + # Training + echo "Starting (larger) triphone training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang exp/tri1 exp/tri1_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 4200 40000 data/train data/lang exp/tri1_ali exp/tri2a + echo "Triphone (large) training done." + + ( + echo "Decoding the dev set using triphone(large) models." + utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2a/graph data/dev exp/tri2a/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored + echo "Triphone(large) decoding done." 
+ ) & +fi + +if [ $stage -le 6 ]; then + ### Triphone + LDA and MLLT + # Training + echo "Starting LDA+MLLT training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b + echo "LDA+MLLT training done." + + ( + echo "Decoding the dev set using LDA+MLLT models." + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2b/graph data/dev exp/tri2b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored + echo "LDA+MLLT decoding done." + ) & +fi + + +if [ $stage -le 7 ]; then + ### Triphone + LDA and MLLT + SAT and FMLLR + # Training + echo "Starting SAT+FMLLR training." + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train data/lang exp/tri2b_ali exp/tri3b + echo "SAT+FMLLR training done." + + ( + echo "Decoding the dev set using SAT+FMLLR models." + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri3b/graph data/dev exp/tri3b/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored + echo "SAT+FMLLR decoding done." + ) & +fi + + +if [ $stage -le 8 ]; then + echo "Starting SGMM training." + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali + + steps/train_ubm.sh --cmd "$train_cmd" \ + 600 data/train data/lang exp/tri3b_ali exp/ubm5b2 + + steps/train_sgmm2.sh --cmd "$train_cmd" \ + 5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2 + echo "SGMM training done." + + ( + echo "Decoding the dev set using SGMM models" + # Graph compilation + utils/mkgraph.sh data/lang_test exp/sgmm2_5b2 exp/sgmm2_5b2/graph + utils/mkgraph.sh data/lang_big/ exp/sgmm2_5b2 exp/sgmm2_5b2/graph_big + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev \ + exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored + + steps/decode_sgmm2.sh --nj $dev_nj --cmd "$decode_cmd" \ + --transform-dir exp/tri3b/decode_dev \ + exp/sgmm2_5b2/graph_big data/dev exp/sgmm2_5b2/decode_dev.big + echo "SGMM decoding done." 
+ ) & +fi + +wait; +#score +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/iban/s5/steps b/egs/iban/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iban/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iban/s5/utils b/egs/iban/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iban/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index 885b8bcd9f3..ca3806bd168 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -164,37 +164,67 @@ %WER 17.16 [ 8982 / 52343, 855 ins, 1421 del, 6706 sub ] exp/nnet2_online/nnet_a_online/decode_test_other_tgmed/wer_12 %WER 18.90 [ 9891 / 52343, 798 ins, 1786 del, 7307 sub ] exp/nnet2_online/nnet_a_online/decode_test_other_tgsmall/wer_13 -# RNNLM rescoring of tri6b - -%WER 7.50 [ 4080 / 54402, 617 ins, 416 del, 3047 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_14 -%WER 7.09 [ 3859 / 54402, 611 ins, 354 del, 2894 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 7.29 [ 3968 / 54402, 661 ins, 332 del, 2975 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_13 -%WER 7.73 [ 4205 / 54402, 709 ins, 349 del, 3147 sub ] exp/tri6b/decode_tglarge_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_12 - -%WER 21.94 [ 11180 / 50948, 1264 ins, 1506 del, 8410 sub ] exp/tri6b/decode_tglarge_dev_other/wer_16 -%WER 21.36 [ 10881 / 50948, 1309 ins, 1362 del, 8210 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.25/wer_16 -%WER 21.29 [ 10848 / 50948, 1330 ins, 1324 del, 8194 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.5/wer_16 -%WER 21.75 [ 11082 / 50948, 1351 ins, 1346 del, 8385 sub ] exp/tri6b/decode_tglarge_dev_other_rnnlm_h150_me5-1000_L0.75/wer_17 - -%WER 9.39 [ 5106 / 54402, 597 ins, 648 del, 3861 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14 -%WER 8.09 [ 4400 / 54402, 564 ins, 517 del, 3319 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_15 -%WER 8.00 [ 4350 / 54402, 609 ins, 472 del, 3269 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_15 -%WER 8.21 [ 4467 / 54402, 692 ins, 415 del, 3360 sub ] exp/tri6b/decode_tgmed_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_12 - -%WER 25.16 [ 12816 / 50948, 1175 ins, 2076 del, 9565 sub ] exp/tri6b/decode_tgmed_dev_other/wer_16 -%WER 23.28 [ 11861 / 50948, 1289 ins, 1546 del, 9026 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 23.03 [ 11732 / 50948, 1341 ins, 1467 del, 8924 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.5/wer_14 -%WER 23.12 [ 11779 / 50948, 1351 ins, 1476 del, 8952 sub ] exp/tri6b/decode_tgmed_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15 - -%WER 10.66 [ 5800 / 54402, 558 ins, 854 del, 4388 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15 -%WER 8.78 [ 4779 / 54402, 586 ins, 588 del, 3605 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 8.50 [ 4624 / 54402, 661 ins, 505 del, 3458 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.5/wer_13 -%WER 8.56 [ 4659 / 54402, 674 ins, 485 del, 3500 sub ] exp/tri6b/decode_tgsmall_dev_clean_rnnlm_h150_me5-1000_L0.75/wer_13 - -%WER 27.18 [ 13850 / 50948, 1192 ins, 2340 del, 10318 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_15 -%WER 24.72 [ 12596 / 50948, 1291 ins, 1803 del, 9502 sub ] 
exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.25/wer_14 -%WER 24.18 [ 12317 / 50948, 1284 ins, 1732 del, 9301 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.5/wer_15 -%WER 24.19 [ 12323 / 50948, 1327 ins, 1686 del, 9310 sub ] exp/tri6b/decode_tgsmall_dev_other_rnnlm_h150_me5-1000_L0.75/wer_15 +# RNNLM rescoring of tri6b (faster-rnnlm hidden=150 direct=4.0Gb, Hierarchical Softmax) +%WER 7.39 [ 4023 / 54402, 540 ins, 444 del, 3039 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_13_1.0 +%WER 7.03 [ 3823 / 54402, 608 ins, 343 del, 2872 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_13_0.5 +%WER 7.03 [ 3827 / 54402, 606 ins, 320 del, 2901 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 7.25 [ 3946 / 54402, 564 ins, 368 del, 3014 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_14_1.0 + +%WER 21.31 [ 10858 / 50948, 1525 ins, 1151 del, 8182 sub ] exp/tri6b/decode_tglarge_dev_other/wer_17_0.0 +%WER 20.62 [ 10504 / 50948, 1377 ins, 1180 del, 7947 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.5 +%WER 20.64 [ 10515 / 50948, 1253 ins, 1313 del, 7949 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_16_1.0 +%WER 20.91 [ 10652 / 50948, 1344 ins, 1233 del, 8075 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_15_1.0 + +%WER 9.21 [ 5012 / 54402, 703 ins, 510 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14_0.0 +%WER 7.99 [ 4345 / 54402, 554 ins, 487 del, 3304 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.5 +%WER 7.68 [ 4177 / 54402, 596 ins, 414 del, 3167 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 7.70 [ 4190 / 54402, 582 ins, 422 del, 3186 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_13_1.0 + +%WER 24.27 [ 12365 / 50948, 1365 ins, 1591 del, 9409 sub ] exp/tri6b/decode_tgmed_dev_other/wer_17_0.0 +%WER 22.51 [ 11468 / 50948, 1496 ins, 1235 del, 8737 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_15_0.0 +%WER 22.11 [ 11267 / 50948, 1494 ins, 1163 del, 8610 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_16_0.0 +%WER 22.10 [ 11262 / 50948, 1532 ins, 1131 del, 8599 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_16_0.0 + +%WER 10.50 [ 5711 / 54402, 693 ins, 674 del, 4344 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15_0.0 +%WER 8.53 [ 4641 / 54402, 582 ins, 555 del, 3504 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.25/wer_14_0.5 +%WER 8.09 [ 4400 / 54402, 605 ins, 469 del, 3326 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.5 +%WER 8.02 [ 4363 / 54402, 594 ins, 460 del, 3309 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me5-1000_L0.75/wer_13_1.0 + +%WER 26.22 [ 13358 / 50948, 1330 ins, 1955 del, 10073 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_17_0.0 +%WER 23.95 [ 12202 / 50948, 1523 ins, 1381 del, 9298 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.25/wer_14_0.0 +%WER 23.22 [ 11828 / 50948, 1553 ins, 1247 del, 9028 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.5/wer_14_0.0 +%WER 23.22 [ 11832 / 50948, 1435 ins, 1376 del, 9021 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me5-1000_L0.75/wer_15_0.5 + +# RNNLM rescoring of tri6b (faster-rnnlm hidden=150 
direct=1.6Gb Noise contrastive Estimation) +%WER 7.39 [ 4023 / 54402, 540 ins, 444 del, 3039 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_13_1.0 +%WER 7.05 [ 3835 / 54402, 487 ins, 447 del, 2901 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_1.0 +%WER 6.84 [ 3723 / 54402, 524 ins, 394 del, 2805 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_13_1.0 +%WER 6.92 [ 3766 / 54402, 564 ins, 376 del, 2826 sub ] exp/tri6b/decode_tglarge_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_12_1.0 + +%WER 21.31 [ 10858 / 50948, 1525 ins, 1151 del, 8182 sub ] exp/tri6b/decode_tglarge_dev_other/wer_17_0.0 +%WER 20.90 [ 10648 / 50948, 1404 ins, 1227 del, 8017 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.5 +%WER 20.70 [ 10544 / 50948, 1271 ins, 1364 del, 7909 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_1.0 +%WER 20.82 [ 10605 / 50948, 1295 ins, 1347 del, 7963 sub ] exp/tri6b/decode_tglarge_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_15_1.0 + +%WER 9.21 [ 5012 / 54402, 703 ins, 510 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_14_0.0 +%WER 8.01 [ 4360 / 54402, 669 ins, 402 del, 3289 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_14_0.0 +%WER 7.46 [ 4056 / 54402, 584 ins, 422 del, 3050 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_14_0.5 +%WER 7.28 [ 3962 / 54402, 536 ins, 451 del, 2975 sub ] exp/tri6b/decode_tgmed_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_14_1.0 + +%WER 24.27 [ 12365 / 50948, 1365 ins, 1591 del, 9409 sub ] exp/tri6b/decode_tgmed_dev_other/wer_17_0.0 +%WER 22.82 [ 11628 / 50948, 1530 ins, 1244 del, 8854 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.0 +%WER 22.21 [ 11315 / 50948, 1554 ins, 1152 del, 8609 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_0.0 +%WER 22.01 [ 11213 / 50948, 1609 ins, 1086 del, 8518 sub ] exp/tri6b/decode_tgmed_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_15_0.0 + +%WER 10.50 [ 5711 / 54402, 693 ins, 674 del, 4344 sub ] exp/tri6b/decode_tgsmall_dev_clean/wer_15_0.0 +%WER 8.56 [ 4659 / 54402, 677 ins, 467 del, 3515 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_14_0.0 +%WER 7.81 [ 4250 / 54402, 657 ins, 387 del, 3206 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_14_0.0 +%WER 7.58 [ 4125 / 54402, 618 ins, 406 del, 3101 sub ] exp/tri6b/decode_tgsmall_dev_clean_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_13_0.5 + +%WER 26.22 [ 13358 / 50948, 1330 ins, 1955 del, 10073 sub ] exp/tri6b/decode_tgsmall_dev_other/wer_17_0.0 +%WER 24.07 [ 12264 / 50948, 1482 ins, 1435 del, 9347 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.25/wer_15_0.0 +%WER 23.15 [ 11797 / 50948, 1526 ins, 1276 del, 8995 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.5/wer_15_0.0 +%WER 22.92 [ 11677 / 50948, 1544 ins, 1241 del, 8892 sub ] exp/tri6b/decode_tgsmall_dev_other_faster-rnnlm_h150-me3-400-nce20_L0.75/wer_16_0.0 ## Multi-splice version of online recipe. 
# for x in exp/nnet2_online/nnet_ms_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -225,6 +255,74 @@ %WER 18.23 [ 9288 / 50948, 782 ins, 1585 del, 6921 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15 %WER 17.54 [ 8936 / 50948, 813 ins, 1425 del, 6698 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_14 +## Multi-splice version of online recipe (5/16/2016). +# for x in exp/nnet2_online/nnet_ms_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 4.46 [ 2429 / 54402, 311 ins, 284 del, 1834 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.64 [ 2522 / 54402, 362 ins, 251 del, 1909 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tglarge/wer_12_0.5 +%WER 5.86 [ 3187 / 54402, 400 ins, 357 del, 2430 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tgmed/wer_13_0.0 +%WER 6.60 [ 3592 / 54402, 450 ins, 403 del, 2739 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 12.31 [ 6274 / 50948, 742 ins, 784 del, 4748 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_fglarge/wer_16_0.5 +%WER 12.87 [ 6557 / 50948, 774 ins, 850 del, 4933 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tglarge/wer_15_0.5 +%WER 15.25 [ 7770 / 50948, 871 ins, 1074 del, 5825 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tgmed/wer_16_0.0 +%WER 16.55 [ 8434 / 50948, 832 ins, 1280 del, 6322 sub ] exp/nnet2_online/nnet_ms_a/decode_dev_other_tgsmall/wer_16_0.0 +%WER 4.99 [ 2624 / 52576, 388 ins, 256 del, 1980 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.15 [ 2709 / 52576, 386 ins, 284 del, 2039 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tglarge/wer_13_0.5 +%WER 6.25 [ 3285 / 52576, 422 ins, 357 del, 2506 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tgmed/wer_13_0.0 +%WER 7.07 [ 3717 / 52576, 455 ins, 456 del, 2806 sub ] exp/nnet2_online/nnet_ms_a/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.89 [ 6748 / 52343, 878 ins, 769 del, 5101 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_fglarge/wer_16_0.0 +%WER 13.32 [ 6972 / 52343, 940 ins, 770 del, 5262 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tglarge/wer_14_0.0 +%WER 15.82 [ 8281 / 52343, 886 ins, 1197 del, 6198 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tgmed/wer_15_0.0 +%WER 17.09 [ 8948 / 52343, 863 ins, 1383 del, 6702 sub ] exp/nnet2_online/nnet_ms_a/decode_test_other_tgsmall/wer_15_0.0 + +# for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 4.53 [ 2466 / 54402, 318 ins, 295 del, 1853 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge/wer_14_1.0 +%WER 4.76 [ 2592 / 54402, 338 ins, 286 del, 1968 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge_utt/wer_13_1.0 +%WER 4.57 [ 2488 / 54402, 330 ins, 285 del, 1873 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_fglarge_utt_offline/wer_13_1.0 +%WER 4.71 [ 2562 / 54402, 392 ins, 236 del, 1934 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge/wer_14_0.0 +%WER 4.90 [ 2665 / 54402, 352 ins, 280 del, 2033 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt/wer_14_0.5 +%WER 4.72 [ 2570 / 54402, 357 ins, 273 del, 1940 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tglarge_utt_offline/wer_14_0.5 +%WER 5.87 [ 3196 / 54402, 419 ins, 340 del, 2437 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed/wer_12_0.0 +%WER 6.11 [ 3326 / 54402, 385 ins, 396 del, 2545 sub ] 
exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt/wer_12_0.5 +%WER 5.99 [ 3258 / 54402, 382 ins, 392 del, 2484 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgmed_utt_offline/wer_12_0.5 +%WER 6.58 [ 3581 / 54402, 472 ins, 379 del, 2730 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 6.89 [ 3746 / 54402, 475 ins, 405 del, 2866 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt/wer_12_0.0 +%WER 6.69 [ 3637 / 54402, 480 ins, 383 del, 2774 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_clean_tgsmall_utt_offline/wer_11_0.0 +%WER 12.67 [ 6456 / 50948, 774 ins, 771 del, 4911 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge/wer_16_0.5 +%WER 13.73 [ 6993 / 50948, 785 ins, 922 del, 5286 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge_utt/wer_14_1.0 +%WER 12.97 [ 6609 / 50948, 797 ins, 801 del, 5011 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_fglarge_utt_offline/wer_16_0.5 +%WER 13.09 [ 6670 / 50948, 800 ins, 826 del, 5044 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge/wer_15_0.5 +%WER 14.27 [ 7270 / 50948, 909 ins, 869 del, 5492 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt/wer_14_0.5 +%WER 13.46 [ 6859 / 50948, 828 ins, 845 del, 5186 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tglarge_utt_offline/wer_15_0.5 +%WER 15.27 [ 7782 / 50948, 874 ins, 1051 del, 5857 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed/wer_16_0.0 +%WER 16.41 [ 8359 / 50948, 949 ins, 1135 del, 6275 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt/wer_16_0.0 +%WER 15.56 [ 7926 / 50948, 893 ins, 1051 del, 5982 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgmed_utt_offline/wer_16_0.0 +%WER 16.49 [ 8402 / 50948, 855 ins, 1210 del, 6337 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall/wer_15_0.0 +%WER 17.80 [ 9068 / 50948, 969 ins, 1260 del, 6839 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt/wer_15_0.0 +%WER 16.97 [ 8647 / 50948, 845 ins, 1324 del, 6478 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_other_tgsmall_utt_offline/wer_17_0.0 +%WER 5.05 [ 2654 / 52576, 411 ins, 239 del, 2004 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge/wer_12_0.5 +%WER 5.24 [ 2755 / 52576, 365 ins, 312 del, 2078 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge_utt/wer_13_1.0 +%WER 5.09 [ 2676 / 52576, 405 ins, 241 del, 2030 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_fglarge_utt_offline/wer_13_0.5 +%WER 5.22 [ 2744 / 52576, 393 ins, 282 del, 2069 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge/wer_13_0.5 +%WER 5.38 [ 2826 / 52576, 413 ins, 284 del, 2129 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge_utt/wer_13_0.5 +%WER 5.24 [ 2757 / 52576, 453 ins, 229 del, 2075 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tglarge_utt_offline/wer_13_0.0 +%WER 6.26 [ 3289 / 52576, 436 ins, 345 del, 2508 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed/wer_13_0.0 +%WER 6.54 [ 3441 / 52576, 435 ins, 381 del, 2625 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed_utt/wer_14_0.0 +%WER 6.28 [ 3303 / 52576, 426 ins, 359 del, 2518 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgmed_utt_offline/wer_14_0.0 +%WER 7.06 [ 3711 / 52576, 446 ins, 474 del, 2791 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall/wer_14_0.0 +%WER 7.31 [ 3845 / 52576, 510 ins, 426 del, 2909 
sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall_utt/wer_12_0.0 +%WER 7.08 [ 3723 / 52576, 460 ins, 445 del, 2818 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_clean_tgsmall_utt_offline/wer_13_0.0 +%WER 13.17 [ 6891 / 52343, 936 ins, 713 del, 5242 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge/wer_14_0.0 +%WER 14.20 [ 7432 / 52343, 832 ins, 983 del, 5617 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge_utt/wer_15_0.5 +%WER 13.26 [ 6939 / 52343, 837 ins, 860 del, 5242 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_fglarge_utt_offline/wer_14_0.5 +%WER 13.53 [ 7080 / 52343, 952 ins, 779 del, 5349 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge/wer_14_0.0 +%WER 14.77 [ 7730 / 52343, 877 ins, 1056 del, 5797 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge_utt/wer_15_0.5 +%WER 13.74 [ 7192 / 52343, 871 ins, 920 del, 5401 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tglarge_utt_offline/wer_14_0.5 +%WER 15.78 [ 8259 / 52343, 898 ins, 1170 del, 6191 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed/wer_15_0.0 +%WER 16.97 [ 8884 / 52343, 939 ins, 1304 del, 6641 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed_utt/wer_16_0.0 +%WER 16.01 [ 8380 / 52343, 877 ins, 1210 del, 6293 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgmed_utt_offline/wer_16_0.0 +%WER 16.98 [ 8889 / 52343, 900 ins, 1283 del, 6706 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall/wer_14_0.0 +%WER 18.21 [ 9533 / 52343, 966 ins, 1398 del, 7169 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall_utt/wer_14_0.0 +%WER 17.29 [ 9050 / 52343, 894 ins, 1391 del, 6765 sub ] exp/nnet2_online/nnet_ms_a_online/decode_test_other_tgsmall_utt_offline/wer_15_0.0 ## Note: this learning rate is the effective learning rate; it gets multiplied by the num-jobs. 
# for x in exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch*{clean,other}*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -323,3 +421,45 @@ %WER 13.79 [ 7219 / 52343, 847 ins, 953 del, 5419 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tglarge_utt_offline/wer_13 %WER 16.08 [ 8416 / 52343, 746 ins, 1466 del, 6204 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgmed_utt_offline/wer_15 %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 + +# Results with nnet3 tdnn +# local/nnet3/run_tdnn.sh +# (4 epoch training on speed-perturbed data) +# num_params=19.3M +%WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.63 [ 2520 / 54402, 369 ins, 259 del, 1892 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tglarge/wer_12_0.5 +%WER 5.90 [ 3211 / 54402, 430 ins, 337 del, 2444 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tgmed/wer_12_0.0 +%WER 6.66 [ 3622 / 54402, 450 ins, 415 del, 2757 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 11.62 [ 5922 / 50948, 727 ins, 741 del, 4454 sub ] exp/nnet3/tdnn_sp/decode_dev_other_fglarge/wer_14_0.5 +%WER 12.19 [ 6209 / 50948, 863 ins, 682 del, 4664 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tglarge/wer_14_0.0 +%WER 14.52 [ 7396 / 50948, 789 ins, 1079 del, 5528 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tgmed/wer_16_0.0 +%WER 15.83 [ 8063 / 50948, 867 ins, 1141 del, 6055 sub ] exp/nnet3/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.97 [ 2614 / 52576, 373 ins, 271 del, 1970 sub ] exp/nnet3/tdnn_sp/decode_test_clean_fglarge/wer_14_0.5 +%WER 5.15 [ 2708 / 52576, 446 ins, 235 del, 2027 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tglarge/wer_13_0.0 +%WER 6.24 [ 3281 / 52576, 467 ins, 336 del, 2478 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 6.95 [ 3654 / 52576, 459 ins, 433 del, 2762 sub ] exp/nnet3/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.14 [ 6352 / 52343, 883 ins, 649 del, 4820 sub ] exp/nnet3/tdnn_sp/decode_test_other_fglarge/wer_13_0.0 +%WER 12.62 [ 6605 / 52343, 898 ins, 720 del, 4987 sub ] exp/nnet3/tdnn_sp/decode_test_other_tglarge/wer_13_0.0 +%WER 15.10 [ 7904 / 52343, 874 ins, 1070 del, 5960 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 + +# Results with nnet3 tdnn+chain model +# local/chain/run_tdnn_6z.sh +# (4 epoch training on speed-perturbed data) +# num_params=16.8M (12.7M after excluding the xent branch) +%WER 3.92 [ 2131 / 54402, 290 ins, 197 del, 1644 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_fglarge/wer_11_0.5 +%WER 4.09 [ 2227 / 54402, 337 ins, 176 del, 1714 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tglarge/wer_11_0.0 +%WER 5.11 [ 2781 / 54402, 329 ins, 300 del, 2152 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tgmed/wer_12_0.0 +%WER 5.83 [ 3172 / 54402, 335 ins, 372 del, 2465 sub ] exp/chain/tdnn_6z_sp/decode_dev_clean_tgsmall/wer_12_0.0 +%WER 10.43 [ 5314 / 50948, 528 ins, 697 del, 4089 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_fglarge/wer_14_0.5 +%WER 10.95 [ 5581 / 50948, 546 ins, 764 del, 4271 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 13.20 [ 6723 / 50948, 676 ins, 858 del, 5189 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tgmed/wer_13_0.0 +%WER 14.56 [ 7419 / 50948, 715 ins, 1003 del, 5701 sub ] exp/chain/tdnn_6z_sp/decode_dev_other_tgsmall/wer_13_0.0 +%WER 4.28 [ 
2251 / 52576, 292 ins, 238 del, 1721 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_fglarge/wer_11_1.0 +%WER 4.47 [ 2349 / 52576, 342 ins, 225 del, 1782 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tglarge/wer_11_0.5 +%WER 5.55 [ 2917 / 52576, 366 ins, 314 del, 2237 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tgmed/wer_13_0.0 +%WER 6.20 [ 3259 / 52576, 383 ins, 381 del, 2495 sub ] exp/chain/tdnn_6z_sp/decode_test_clean_tgsmall/wer_12_0.0 +%WER 10.76 [ 5634 / 52343, 643 ins, 672 del, 4319 sub ] exp/chain/tdnn_6z_sp/decode_test_other_fglarge/wer_12_0.5 +%WER 11.20 [ 5864 / 52343, 619 ins, 781 del, 4464 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tglarge/wer_13_0.5 +%WER 13.47 [ 7051 / 52343, 733 ins, 933 del, 5385 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 14.73 [ 7710 / 52343, 662 ins, 1209 del, 5839 sub ] exp/chain/tdnn_6z_sp/decode_test_other_tgsmall/wer_14_0.0 diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
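+# For example, to run everything locally on a single machine with run.pl you
+# could instead set:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl
+#   export mkgraph_cmd=run.pl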
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/local/chain/run_chain_common.sh b/egs/librispeech/s5/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..ab8b065ddd3 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_chain_common.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes +set -e + +# configs for 'chain' +stage=0 +# chain options +frames_per_eg=150 +max_wer= + +# output directory names +dir= +treedir= +lang= +min_seg_len= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $treedir ] && echo "Set --treedir, this specifies the directory to store new tree " && exit 1; +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $dir ] && echo "Set --dir, this specifies the experiment directory to store files relevant to the experiment " && exit 1; + +# The iVector-extraction and feature-dumping parts are the same as the standard +# nnet3 setup, and you can skip them by setting "--stage 10" if you have already +# run those things. + +local/nnet3/run_ivector_common.sh --stage $stage \ + --speed-perturb true \ + --generate-alignments false || exit 1; + + +# Set the variables. These are based on variables set by run_ivector_common.sh +gmm_dir=exp/tri6b +train_set=train_960_sp +latgen_train_set=train_960_sp +ali_dir=exp/tri6b_sp +lat_dir=exp/tri6b_lats_sp + +################################### + +if [ $stage -le 10 ]; then + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + rm -rf $lang + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
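+  # Note: --frame-subsampling-factor 3 below means the 'chain' model emits one
+  # output per 3 input frames (about every 30 ms rather than every 10 ms); the
+  # 1-state-per-phone topology generated above is what makes this reduced
+  # output frame rate workable.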
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 6000 data/$latgen_train_set $lang $ali_dir $treedir +fi + +# combining the segments in training data to have a minimum length of frames_per_eg + tolerance +# this is critical stage in AMI (gives 1% absolute improvement) +if [ -z $min_seg_len ]; then + min_seg_len=$(python -c "print ($frames_per_eg+5)/100.0") +fi + +if [ $stage -le 12 ]; then + rm -rf data/${train_set}_min${min_seg_len}_hires + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/${train_set}_hires \ + --output-data-dir data/${train_set}_min${min_seg_len}_hires + + #extract ivectors for the new data + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 \ + data/${train_set}_min${min_seg_len}_hires data/${train_set}_min${min_seg_len}_hires_max2 + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_min${min_seg_len}_hires_max2 \ + exp/nnet3/extractor \ + exp/nnet3/ivectors_${train_set}_min${min_seg_len} || exit 1; + + # combine the non-hires features for alignments/lattices + rm -rf data/${latgen_train_set}_min${min_seg_len} + steps/cleanup/combine_short_segments.py --minimum-duration $min_seg_len \ + --input-data-dir data/${latgen_train_set} \ + --output-data-dir data/${latgen_train_set}_min${min_seg_len} +fi + +train_set=${train_set}_min${min_seg_len} +latgen_train_set=${latgen_train_set}_min${min_seg_len} +ivector_dir=exp/nnet3/ivectors_${train_set} +ali_dir=${ali_dir}_min${min_seg_len} +lat_dir=${lat_dir}_min${min_seg_len} +if [ $stage -le 13 ]; then + # realigning data as the segments would have changed + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" data/$latgen_train_set data/lang $gmm_dir $ali_dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$latgen_train_set \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +mkdir -p $dir +train_data_dir=data/${train_set}_hires +if [ ! -z $max_wer ]; then + if [ $stage -le 15 ]; then + bad_utts_dir=${gmm_dir}_${train_set}_bad_utts + if [ ! -f $bad_utts_dir/all_info.sorted.txt ]; then + # This stage takes a lot of time ~7hrs, so run only if file is not available already + steps/cleanup/find_bad_utts.sh --cmd "$decode_cmd" --nj 405 data/$latgen_train_set data/lang $ali_dir $bad_utts_dir + fi + python local/sort_bad_utts.py --bad-utt-info-file $bad_utts_dir/all_info.sorted.txt --max-wer $max_wer --output-file $dir/wer_sorted_utts_${max_wer}wer + utils/copy_data_dir.sh --validate-opts "--no-wav" data/${train_set}_hires data/${train_set}_${max_wer}wer_hires + utils/filter_scp.pl $dir/wer_sorted_utts_${max_wer}wer data/${train_set}_hires/feats.scp > data/${train_set}_${max_wer}wer_hires/feats.scp + utils/fix_data_dir.sh data/${train_set}_${max_wer}wer_hires + fi + train_data_dir=data/${train_set}_${max_wer}wer_hires + # we don't realign again as the segment ids don't change +fi + +cat > $dir/vars < from the graph + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $dir/graph_test_tgsmall/HCLG.fst $dir/graph_test_tgsmall/HCLG.fst +fi + +graph_dir=$dir/graph_test_tgsmall +if [ $stage -le 19 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || touch $dir/.error + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || touch $dir/.error + ) & + done +fi +wait; +exit 0; diff --git a/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh new file mode 100755 index 00000000000..944cfe255da --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_tdnn_6z_discriminative.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/chain/tdnn_6z_sp +. $srcdir/vars +# sets the directory names where features, ivectors and lattices are stored +#train_data_dir +#train_ivector_dir +#lat_dir + +online_ivector_dir=$train_ivector_dir +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.00000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 +modify_learning_rates=true +last_layer_factor=0.1 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. + steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b{01,02,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_tgsmall +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in test_clean test_other dev_clean dev_other; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_tgsmall_$iter || touch $dir/.error; + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,tgmed}_$iter || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,tglarge}_$iter || touch $dir/.error + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}_{tgsmall,fglarge}_$iter || touch $dir/.error + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. 
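+  # e.g.:
+  #   local/chain/run_tdnn_6z_discriminative.sh --cleanup true --stage 6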
+ rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/librispeech/s5/local/data_prep.sh b/egs/librispeech/s5/local/data_prep.sh index a46e2de4f04..5a264a07464 100755 --- a/egs/librispeech/s5/local/data_prep.sh +++ b/egs/librispeech/s5/local/data_prep.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2014 Vassil Panayotov +# Copyright 2014 Vassil Panayotov # 2014 Johns Hopkins University (author: Daniel Povey) # Apache 2.0 @@ -31,6 +31,7 @@ wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp trans=$dst/text; [[ -f "$trans" ]] && rm $trans utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender +utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur for reader_dir in $(find $src -mindepth 1 -maxdepth 1 -type d | sort); do reader=$(basename $reader_dir) @@ -78,6 +79,8 @@ nutt2spk=$(wc -l <$utt2spk) ! [ "$ntrans" -eq "$nutt2spk" ] && \ echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; +utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 + utils/validate_data_dir.sh --no-feats $dst || exit 1; echo "$0: successfully prepared data in $dst" diff --git a/egs/librispeech/s5/local/decode_example.sh b/egs/librispeech/s5/local/decode_example.sh index 11a0670f240..815bf17b9f7 100755 --- a/egs/librispeech/s5/local/decode_example.sh +++ b/egs/librispeech/s5/local/decode_example.sh @@ -34,22 +34,10 @@ mfccdir=mfcc # here. lang=data/lang lang_test=data/lang_test -lang_test_tmp=data/local/lang_test_tmp/ -mkdir -p $lang_test_tmp mkdir -p $lang_test cp -r $lang/* $lang_test -gunzip -c $lm | utils/find_arpa_oovs.pl $lang_test/words.txt \ - > $lang_test_tmp/oovs.txt || exit 1 -gunzip -c $lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lang_test_tmp/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | \ - fstcompile --isymbols=$lang_test/words.txt --osymbols=$lang_test/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_test/G.fst +gunzip -c $lm | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst utils/validate_lang.pl --skip-determinization-check $lang_test || exit 1; # Compiles decoding graph. diff --git a/egs/librispeech/s5/local/format_data.sh b/egs/librispeech/s5/local/format_data.sh index 52159f5e500..64914bde42d 100755 --- a/egs/librispeech/s5/local/format_data.sh +++ b/egs/librispeech/s5/local/format_data.sh @@ -18,40 +18,23 @@ fi lm_dir=$1 -tmpdir=data/local/lm_tmp lexicon=data/local/lang_tmp/lexiconp.txt -mkdir -p $tmpdir # This loop was taken verbatim from wsj_format_data.sh, and I'm leaving it in place in # case we decide to add more language models at some point for lm_suffix in tgpr; do test=data/lang_test_${lm_suffix} mkdir -p $test - for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones oov.txt oov.int; do + for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones topo oov.txt oov.int; do cp -r data/lang/$f $test done - gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1 - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. 
Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; done echo "Succeeded in formatting data." -rm -r $tmpdir exit 0 diff --git a/egs/librispeech/s5/local/format_lms.sh b/egs/librispeech/s5/local/format_lms.sh index d83029b0e1f..b530f61d2d9 100755 --- a/egs/librispeech/s5/local/format_lms.sh +++ b/egs/librispeech/s5/local/format_lms.sh @@ -49,24 +49,9 @@ for lm_suffix in tgsmall tgmed; do test=${src_dir}_test_${lm_suffix} mkdir -p $test cp -r ${src_dir}/* $test - gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz |\ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt || exit 1 - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other - # similar things in a LM from Geoff. Removing all "illegal" combinations of - # and , which are supposed to occur only at being/end of utt. These - # can cause determinization failures of CLG [ends up being epsilon cycles]. gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/librispeech/s5/local/nnet3/run_ivector_common.sh b/egs/librispeech/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a82e26fefe7 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# this script contains some common (shared) parts of the run_nnet*.sh scripts. + +. cmd.sh + + +stage=0 +generate_alignments=true # false if doing ctc training +speed_perturb=true + +set -e +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +train_set=train_960 +if [ "$speed_perturb" == "true" ]; then + if [ $stage -le 1 ]; then + #Although the nnet will be trained on high-resolution data, we still have to perturb the normal data to get the alignment + # _sp stands for speed-perturbed + + for datadir in train_960; do + utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 + utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${datadir}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 + utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 + utils/fix_data_dir.sh data/${datadir}_sp + rm -r data/temp0 data/${datadir}_tmp + done + fi + + if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + #obtain the alignment of the perturbed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/train_960_sp data/lang exp/tri6b exp/tri6b_sp || exit 1 + fi + train_set=train_960_sp +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in $train_set test_clean test_other dev_clean dev_other; do + if [ "$datadir" == "$train_set" ]; then + utils/data/perturb_data_dir_volume.sh data/$datadir + fi + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + done + + # now create some data subsets. + # mixed is the clean+other data. + # 30k is 1/10 of the data (around 100 hours), 60k is 1/5th of it (around 200 hours). + utils/subset_data_dir.sh data/${train_set}_hires 30000 data/${train_set}_mixed_hires_30k + utils/subset_data_dir.sh data/${train_set}_hires 60000 data/${train_set}_mixed_hires_60k +fi + +if [ $stage -le 4 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We align a subset of training data for + # this purpose. + utils/subset_data_dir.sh --utt-list <(awk '{print $1}' data/${train_set}_mixed_hires_30k/utt2spk) \ + data/${train_set} data/${train_set}_30k + + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/${train_set}_30k data/lang exp/tri6b exp/nnet3/tri6b_ali_30k +fi + +if [ $stage -le 5 ]; then + # Train a small system just for its LDA+MLLT transform.
We use --num-iters 13 + # because after we get the transform (12th iter is the last), any further + # training is pointless. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --realign-iters "" \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/${train_set}_mixed_hires_30k data/lang \ + exp/nnet3/tri6b_ali_30k exp/nnet3/tri7b +fi + + +if [ $stage -le 6 ]; then + mkdir -p exp/nnet3 + # To train a diagonal UBM we don't need very much data, so use a small subset + # (actually, it's not that small: still around 100 hours). + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/${train_set}_mixed_hires_30k 512 exp/nnet3/tri7b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can in general be sensitive to the amount of data, but + # this one has a fairly small dim (defaults to 100) so we don't use all of it, + # we use just the 60k subset (about one fifth of the data, or 200 hours). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${train_set}_mixed_hires_60k exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + ivectordir=exp/nnet3/ivectors_${train_set} + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + fi + # We extract iVectors on all the train data, which will be what we train the + # system on. With --utts-per-spk-max 2, the script. pairs the utterances + # into twos, and treats each of these pairs as one speaker. Note that these + # are extracted 'online'. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \ + data/${train_set}_hires_max2 exp/nnet3/extractor $ivectordir || exit 1; +fi + +if [ $stage -le 9 ]; then + for data in test_clean test_other dev_clean dev_other; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ + data/${data}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data} || exit 1; + done + wait +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..be253beda2f --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +. cmd.sh + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=0 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < is the directory in which the text corpus is downloaded" echo " is the directory in which the language model is stored" echo "Main options:" - echo " --hidden # default 150. Hidden layer size" - echo " --maxent-order # default 5. Maxent features order size" - echo " --maxent-size # default 1000. 
Maxent features hash size" + echo " --rnnlm-options # default '$rnnlm_options'. Command line arguments to pass to rnnlm" + echo " --rnnlm-tag # default '$rnnlm_tag' The tag is appended to exp/ folder name" echo " --num-threads # default 16. Number of concurrent threadss to train RNNLM" echo " --stage # 1 to download and prepare data, 2 to train RNNLM, 3 to rescore tri6b with a trained RNNLM" exit 1 @@ -36,51 +35,69 @@ fi s5_dir=`pwd` data_dir=`readlink -f $1` lm_dir=`readlink -f $2` -rnnlm_ver=rnnlm-hs-0.1b # Probably could make this an option, but Tomas's RNN will take long to train on 200K vocab -rnnlmdir=data/lang_rnnlm_h${hidden}_me${maxent_order}-${maxent_size} -export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH +modeldir=data/lang_${rnnlm_ver}_${rnnlm_tag} if [ $stage -le 1 ]; then echo "$0: Prepare training data for RNNLM" cd $data_dir - wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz - gunzip librispeech-lm-norm.txt.gz - $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt librispeech-lm-norm.txt | shuf > librispeech-lm-norm.train.txt - $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt <(awk '{$1=""; print $0}' $s5_dir/data/train_960/text) > librispeech-lm-norm.dev.txt - rm librispeech-lm-norm.txt + if [ -f "librispeech-lm-norm.dev.txt" ]; then + echo "$0: SKIP File librispeech-lm-norm.dev.txt already exists" + else + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz + gunzip librispeech-lm-norm.txt.gz + $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt librispeech-lm-norm.txt | shuf > librispeech-lm-norm.train.txt + $s5_dir/utils/filt.py $lm_dir/librispeech-vocab.txt <(awk '{$1=""; print $0}' $s5_dir/data/train_960/text) > librispeech-lm-norm.dev.txt.tmp + mv librispeech-lm-norm.dev.txt.tmp librispeech-lm-norm.dev.txt + rm librispeech-lm-norm.txt + fi cd $s5_dir - + fi if [ $stage -le 2 ]; then echo "$0: Training RNNLM. It will probably take several hours." - cd $KALDI_ROOT/tools - if [ -f $rnnlm_ver/rnnlm ]; then - echo "Not installing the rnnlm toolkit since it is already there." 
+ $KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 + rnnlm_path="$(readlink -f $KALDI_ROOT)/tools/$rnnlm_ver/rnnlm" + cd $s5_dir + mkdir -p $modeldir + echo "$0: Model file: $modeldir/rnnlm" + if [ -f "$modeldir/rnnlm" ]; then + echo "$0: SKIP file '$modeldir/rnnlm' already exists" else - extras/install_rnnlm_hs.sh + rm -f $modeldir/rnnlm.tmp + rnnlm_cmd="$rnnlm_path" + if type taskset >/dev/null 2>&1 ; then + # HogWild works much faster if all threads are bound to the same physical CPU + rnnlm_cmd="taskset -c $(seq -s, 0 $(( $num_threads - 1 )) ) $rnnlm_cmd" + fi + $rnnlm_cmd -rnnlm $modeldir/rnnlm.tmp \ + -train $data_dir/librispeech-lm-norm.train.txt \ + -valid $data_dir/librispeech-lm-norm.dev.txt \ + -threads $num_threads $rnnlm_options -retry 1 -stop 1.0 2>&1 | tee $modeldir/rnnlm.log + touch $modeldir/unk.probs + awk '{print $1}' $modeldir/rnnlm.tmp > $modeldir/wordlist.rnn + mv $modeldir/rnnlm.tmp $modeldir/rnnlm + mv $modeldir/rnnlm.tmp.nnet $modeldir/rnnlm.nnet fi - cd $s5_dir - mkdir -p $rnnlmdir - rnnlm -rnnlm $rnnlmdir/rnnlm -train $data_dir/librispeech-lm-norm.train.txt -valid $data_dir/librispeech-lm-norm.dev.txt \ - -threads $num_threads -hidden $hidden -direct-order $maxent_order -direct $maxent_size -retry 1 -stop 1.0 - touch $rnnlmdir/unk.probs - awk '{print $1}' $rnnlmdir/rnnlm > $rnnlmdir/wordlist.rnn fi if [ $stage -le 3 ]; then echo "$0: Performing RNNLM rescoring on tri6b decoding results" - for lm in tgsmall tgmed; do + for lm in tgsmall tgmed tglarge; do for devset in dev_clean dev_other; do sourcedir=exp/tri6b/decode_${lm}_${devset} - resultsdir=${sourcedir}_rnnlm_h${hidden}_me${maxent_order}-${maxent_size} - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 0.5 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.5 - cp -r ${resultsdir}_L0.5 ${resultsdir}_L0.25 - cp -r ${resultsdir}_L0.5 ${resultsdir}_L0.75 - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 --stage 7 0.25 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.25 - steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver --N 100 --stage 7 0.75 data/lang_test_$lm $rnnlmdir data/$devset $sourcedir ${resultsdir}_L0.75 + if [ ! -d "$sourcedir" ]; then + echo "$0: WARNING cannot find source dir '$sourcedir' to rescore" + continue + fi + resultsdir=${sourcedir}_${rnnlm_ver}_${rnnlm_tag} + rm -rf ${resultsdir}_L0.5 + steps/rnnlmrescore.sh --skip_scoring false --rnnlm_ver $rnnlm_ver --N 100 0.5 data/lang_test_$lm $modeldir data/$devset $sourcedir ${resultsdir}_L0.5 + for coef in 0.25 0.75; do + rm -rf ${resultsdir}_L${coef} + cp -r ${resultsdir}_L0.5 ${resultsdir}_L${coef} + steps/rnnlmrescore.sh --skip_scoring false --rnnlm_ver $rnnlm_ver --N 100 --stage 7 $coef data/lang_test_$lm $modeldir data/$devset $sourcedir ${resultsdir}_L${coef} + done done done fi - - diff --git a/egs/librispeech/s5/local/score.sh b/egs/librispeech/s5/local/score.sh index f6359c189b4..3082c5eb9ee 100755 --- a/egs/librispeech/s5/local/score.sh +++ b/egs/librispeech/s5/local/score.sh @@ -13,6 +13,7 @@ reverse=false word_ins_penalty=0.0,0.5,1.0 min_lmwt=9 max_lmwt=20 +iter=final #end configuration section. [ -f ./path.sh ] && . ./path.sh diff --git a/egs/librispeech/s5/path.sh b/egs/librispeech/s5/path.sh index 74b6e31ad44..03df6dd9f2b 100755 --- a/egs/librispeech/s5/path.sh +++ b/egs/librispeech/s5/path.sh @@ -1,5 +1,7 @@ export KALDI_ROOT=`pwd`/../../..
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C # we use this both in the (optional) LM training and the G2P-related scripts diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 02880f3741b..5e969418c93 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -2,7 +2,7 @@ # Set this to somewhere where you want to put your data, or where -# someone else has already put it. You'll want to change this +# someone else has already put it. You'll want to change this # if you're not on the CLSP grid. data=/export/a15/vpanayotov/data @@ -10,8 +10,8 @@ data=/export/a15/vpanayotov/data data_url=www.openslr.org/resources/12 lm_url=www.openslr.org/resources/11 -. cmd.sh -. path.sh +. ./cmd.sh +. ./path.sh # you might not want to do this for interactive shells. set -e @@ -24,12 +24,12 @@ for part in dev-clean test-clean dev-other test-other train-clean-100; do done # download the LM resources -local/download_lm.sh $lm_url data/local/lm || exit 1 +local/download_lm.sh $lm_url data/local/lm # format the data as Kaldi data directories for part in dev-clean test-clean dev-other test-other train-clean-100; do # use underscore-separated names in data directories. - local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) || exit 1 + local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) done ## Optional text corpus normalization and LM training @@ -39,7 +39,7 @@ done ## well as some intermediate data(e.g. the normalized text used for LM training), ## are available for download at http://www.openslr.org/11/ #local/lm/train_lm.sh $LM_CORPUS_ROOT \ -# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm || exit 1 +# data/local/lm/norm/tmp data/local/lm/norm/norm_texts data/local/lm ## Optional G2P training scripts. 
## As the LM training scripts above, this script is intended primarily to @@ -49,24 +49,24 @@ done # when "--stage 3" option is used below we skip the G2P steps, and use the # lexicon we have already downloaded from openslr.org/11/ local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ - data/local/lm data/local/lm data/local/dict_nosp || exit 1 + data/local/lm data/local/lm data/local/dict_nosp utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp || exit 1; + "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm || exit 1 +local/format_lms.sh --src-dir data/lang_nosp data/local/lm # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_tglarge || exit 1; + data/lang_nosp data/lang_nosp_test_tglarge utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_fglarge || exit 1; + data/lang_nosp data/lang_nosp_test_fglarge mfccdir=mfcc # spread the mfccs over various machines, as this data-set is quite large. -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ + utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ $mfccdir/storage fi @@ -87,15 +87,15 @@ utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k # train a monophone system steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ - data/train_2kshort data/lang_nosp exp/mono || exit 1; + data/train_2kshort data/lang_nosp exp/mono # decode using the monophone model ( utils/mkgraph.sh --mono data/lang_nosp_test_tgsmall \ - exp/mono exp/mono/graph_nosp_tgsmall || exit 1 + exp/mono exp/mono/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ - data/$test exp/mono/decode_nosp_tgsmall_$test || exit 1 + data/$test exp/mono/decode_nosp_tgsmall_$test done )& @@ -104,97 +104,97 @@ steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ # train a first delta + delta-delta triphone system on a subset of 5000 utterances steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 || exit 1; + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 # decode using the tri1 model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri1 exp/tri1/graph_nosp_tgsmall || exit 1; + exp/tri1 exp/tri1/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ - data/$test exp/tri1/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri1/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test done )& steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - 
data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k # train an LDA+MLLT system. steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b || exit 1; + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b # decode using the LDA+MLLT model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri2b exp/tri2b/graph_nosp_tgsmall || exit 1; + exp/tri2b exp/tri2b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ - data/$test exp/tri2b/decode_nosp_tgsmall_$test || exit 1; + data/$test exp/tri2b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test done )& # Align a 10k utts subset using the tri2b model steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k || exit 1; + data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k # Train tri3b, which is LDA+MLLT+SAT on 10k utts steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b || exit 1; + data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b # decode using the tri3b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri3b exp/tri3b/graph_nosp_tgsmall || exit 1; + exp/tri3b exp/tri3b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri3b/graph_nosp_tgsmall data/$test \ - exp/tri3b/decode_nosp_tgsmall_$test || exit 1; + exp/tri3b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test done )& # align the entire train_clean_100 subset using the tri3b model steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ data/train_clean_100 data/lang_nosp \ - exp/tri3b exp/tri3b_ali_clean_100 || exit 1; + exp/tri3b exp/tri3b_ali_clean_100 # train another LDA+MLLT+SAT system on the entire 100 hour subset steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train_clean_100 data/lang_nosp \ - exp/tri3b_ali_clean_100 exp/tri4b || exit 1; + exp/tri3b_ali_clean_100 exp/tri4b # decode using the tri4b model ( utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri4b exp/tri4b/graph_nosp_tgsmall || exit 1; + exp/tri4b exp/tri4b/graph_nosp_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_nosp_tgsmall data/$test \ - exp/tri4b/decode_nosp_tgsmall_$test || exit 1; + exp/tri4b/decode_nosp_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test 
exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test done )& @@ -205,137 +205,151 @@ steps/get_prons.sh --cmd "$train_cmd" \ utils/dict_dir_add_pronprobs.sh --max-normalize true \ data/local/dict_nosp \ exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict || exit 1 + exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict utils/prepare_lang.sh data/local/dict \ - "" data/local/lang_tmp data/lang + "" data/local/lang_tmp data/lang local/format_lms.sh --src-dir data/lang data/local/lm utils/build_const_arpa_lm.sh \ - data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge || exit 1; + data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge utils/build_const_arpa_lm.sh \ - data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge || exit 1; + data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge # decode using the tri4b model with pronunciation and silence probabilities ( utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall || exit 1; + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri4b/graph_tgsmall data/$test \ - exp/tri4b/decode_tgsmall_$test || exit 1; + exp/tri4b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test done )& # align train_clean_100 using the tri4b model steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 || exit 1; + data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 # if you want at this point you can train and test NN model(s) on the 100 hour # subset -local/nnet2/run_5a_clean_100.sh || exit 1 +local/nnet2/run_5a_clean_100.sh -local/download_and_untar.sh $data $data_url train-clean-360 || exit 1; +local/download_and_untar.sh $data $data_url train-clean-360 # now add the "clean-360" subset to the mix ... local/data_prep.sh \ - $data/LibriSpeech/train-clean-360 data/train_clean_360 || exit 1 + $data/LibriSpeech/train-clean-360 data/train_clean_360 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ - exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + exp/make_mfcc/train_clean_360 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir || exit 1 + data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir # ... 
and then combine the two sets into a 460 hour one utils/combine_data.sh \ - data/train_clean_460 data/train_clean_100 data/train_clean_360 || exit 1 + data/train_clean_460 data/train_clean_100 data/train_clean_360 # align the new, combined set, using the tri4b model steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 || exit 1; + data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 # create a larger SAT model, trained on the 460 hours of data. steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ - data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b || exit 1; + data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b # decode using the tri5b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri5b exp/tri5b/graph_tgsmall || exit 1; + exp/tri5b exp/tri5b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ exp/tri5b/graph_tgsmall data/$test \ - exp/tri5b/decode_tgsmall_$test || exit 1; + exp/tri5b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test done )& # train a NN model on the 460 hour set -local/nnet2/run_6a_clean_460.sh || exit 1 +local/nnet2/run_6a_clean_460.sh -local/download_and_untar.sh $data $data_url train-other-500 || exit 1; +local/download_and_untar.sh $data $data_url train-other-500 # prepare the 500 hour subset. local/data_prep.sh \ - $data/LibriSpeech/train-other-500 data/train_other_500 || exit 1 + $data/LibriSpeech/train-other-500 data/train_other_500 steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ - exp/make_mfcc/train_other_500 $mfccdir || exit 1 + exp/make_mfcc/train_other_500 $mfccdir steps/compute_cmvn_stats.sh \ - data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir || exit 1 + data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir # combine all the data utils/combine_data.sh \ - data/train_960 data/train_clean_460 data/train_other_500 || exit 1 + data/train_960 data/train_clean_460 data/train_other_500 steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 || exit 1; + data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script # as it is faster. 
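# (train_quick.sh is faster mainly because it reuses the exp/tri5b_ali_960
# alignments and initializes the new model from the existing one rather than
# training from a flat start; see steps/train_quick.sh for the exact procedure.)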
steps/train_quick.sh --cmd "$train_cmd" \ - 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b || exit 1; + 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b # decode using the tri6b model ( utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri6b exp/tri6b/graph_tgsmall || exit 1; + exp/tri6b exp/tri6b/graph_tgsmall for test in test_clean test_other dev_clean dev_other; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test || exit 1; + exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test steps/lmrescore_const_arpa.sh \ --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test || exit 1; + data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test done )& # steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \ # data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h -# #Perform RNNLM rescoring of tri6b +# #Perform rescoring of tri6b by means of faster-rnnlm # #Attention: with default settings requires 4 GB of memory per rescoring job, so commenting this out by default -# local/run_rnnlm.sh $data data/local/lm +# wait && local/run_rnnlm.sh \ +# --rnnlm-ver "faster-rnnlm" \ +# --rnnlm-options "-hidden 150 -direct 1000 -direct-order 5" \ +# --rnnlm-tag "h150-me5-1000" $data data/local/lm + +# #Perform rescoring of tri6b by means of faster-rnnlm using Noise contrastive estimation +# #Note that this could be extremely slow without CUDA +# #We use a smaller direct layer size so that it can be stored in GPU memory (~2Gb) +# #Surprisingly, the bottleneck here is validation rather than learning +# #Therefore you can use a smaller validation dataset to speed up training +# wait && local/run_rnnlm.sh \ +# --rnnlm-ver "faster-rnnlm" \ +# --rnnlm-options "-hidden 150 -direct 400 -direct-order 3 --nce 20" \ +# --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm + # train NN models on the entire dataset -local/nnet2/run_7a_960.sh || exit 1 +local/nnet2/run_7a_960.sh # # train models on cleaned-up data # # we've found that this isn't helpful-- see the comments in local/run_data_cleaning.sh diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..."
-export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre/v1/path.sh b/egs/lre/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/lre/v1/path.sh +++ b/egs/lre/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." 
-export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/local/make_lre07.pl b/egs/lre07/v1/local/make_lre07.pl index db29880a2f4..3dd2c089d96 100755 --- a/egs/lre07/v1/local/make_lre07.pl +++ b/egs/lre07/v1/local/make_lre07.pl @@ -40,10 +40,10 @@ open(DUR10, ">$dir/10sec") || die "Failed opening output file $dir/10sec"; open(DUR30, ">$dir/30sec") || die "Failed opening output file $dir/30sec"; -my $key_str = `wget -qO- "http://www.itl.nist.gov/iad/mig/tests/lang/2007/lid07key_v5.txt"`; +my $key_str = `wget -qO- "http://www.openslr.org/resources/23/lre07_key.txt"`; @key_lines = split("\n",$key_str); -%utt2lang = (); -%utt2dur = (); +%utt2lang = (); +%utt2dur = (); foreach (@key_lines) { @words = split(' ', $_); if (index($words[0], "#") == -1) { diff --git a/egs/lre07/v1/path.sh b/egs/lre07/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/lre07/v1/path.sh +++ b/egs/lre07/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/lre07/v1/run.sh b/egs/lre07/v1/run.sh index ff3f8ad94cd..4579c06b523 100755 --- a/egs/lre07/v1/run.sh +++ b/egs/lre07/v1/run.sh @@ -1,10 +1,10 @@ #!/bin/bash -# Copyright 2014 David Snyder -# Daniel Povey +# Copyright 2014-2015 David Snyder +# Daniel Povey # Apache 2.0. # # This script runs the NIST 2007 General Language Recognition Closed-Set -# evaluation. +# evaluation. . cmd.sh . 
path.sh @@ -36,7 +36,7 @@ local/make_lre07_train.pl /export/corpora5/LDC/LDC2009S05 data local/make_lre09.pl /export/corpora5/NIST/LRE/LRE2009/eval data # Make the evaluation data set. We're concentrating on the General Language -# Recognition Closet-Set evaluation, so we remove the dialects and filter +# Recognition Closed-Set evaluation, so we remove the dialects and filter # out the unknown languages used in the open-set evaluation. local/make_lre07.pl /export/corpora5/LDC/LDC2009S04 data/lre07_all @@ -60,7 +60,8 @@ for d in $src_list; do rm -f $d/spk2gender 2>/dev/null; done utils/combine_data.sh data/train_unsplit $src_list # original utt2lang will remain in data/train_unsplit/.backup/utt2lang. -utils/apply_map.pl -f 2 --permissive local/lang_map.txt < data/train_unsplit/utt2lang 2>/dev/null > foo +utils/apply_map.pl -f 2 --permissive local/lang_map.txt \ + < data/train_unsplit/utt2lang 2>/dev/null > foo cp foo data/train_unsplit/utt2lang rm foo @@ -70,9 +71,9 @@ echo "**Language count in i-Vector extractor training (after splitting long utte awk '{print $2}' data/train/utt2lang | sort | uniq -c | sort -nr # This commented script is an alternative to the above utterance -# splitting method. Here we split the utterance based on the number of +# splitting method. Here we split the utterance based on the number of # frames which are voiced, rather than the total number of frames. -# max_voiced=3000 +# max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train use_vtln=true @@ -81,7 +82,7 @@ if $use_vtln; then cp -r data/${t} data/${t}_novtln rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \ - data/${t}_novtln exp/make_mfcc $mfccdir + data/${t}_novtln exp/make_mfcc $mfccdir lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir done @@ -98,7 +99,7 @@ if $use_vtln; then data/train_5k_novtln exp/diag_ubm_vtln exp/vtln for t in lre07 train; do - lid/get_vtln_warps.sh --nj 100 --cmd "$train_cmd" \ + lid/get_vtln_warps.sh --nj 50 --cmd "$train_cmd" \ data/${t}_novtln exp/vtln exp/${t}_warps cp exp/${t}_warps/utt2warp data/$t/ done @@ -126,18 +127,18 @@ utils/subset_data_dir.sh data/train 5000 data/train_5k utils/subset_data_dir.sh data/train 10000 data/train_10k -lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd" data/train_5k 2048 \ - exp/diag_ubm_2048 -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train_10k \ - exp/diag_ubm_2048 exp/full_ubm_2048_10k +lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ + data/train_5k 2048 exp/diag_ubm_2048 +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ + data/train_10k exp/diag_ubm_2048 exp/full_ubm_2048_10k -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ - exp/full_ubm_2048_10k exp/full_ubm_2048 +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ + data/train exp/full_ubm_2048_10k exp/full_ubm_2048 # Alternatively, a diagonal UBM can replace the full UBM used above. # The preceding calls to train_diag_ubm.sh and train_full_ubm.sh # can be commented out and replaced with the following lines. -# +# # This results in a slight degradation but could improve error rate when # there is less training data than used in this example. 
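# The -l mem_free=...,ram_free=... strings appended to "$train_cmd" in the UBM
# training calls above are GridEngine resource requests passed through queue.pl
# to qsub; with the queue-agnostic setup described in cmd.sh the same request
# can be expressed with the generic --mem option instead. A hypothetical
# equivalent of the last call above (sketch only, not an extra step):
#   lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 35G" \
#     data/train exp/full_ubm_2048_10k exp/full_ubm_2048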
# @@ -147,7 +148,8 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ + --use-weights true \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 @@ -167,13 +169,13 @@ lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 lid/run_logistic_regression.sh --prior-scale 0.70 \ - --conf conf/logistic-regression.conf + --conf conf/logistic-regression.conf # Training error-rate -# ER (%): 5.15 +# ER (%): 3.95 # General LR 2007 closed-set eval local/lre07_eval/lre07_eval.sh exp/ivectors_lre07 \ local/general_lr_closed_set_langs.txt # Duration (sec): avg 3 10 30 -# ER (%): 23.58 43.95 19.43 7.37 -# C_avg (%): 14.79 27.23 12.16 4.97 +# ER (%): 23.11 42.84 19.33 7.18 +# C_avg (%): 14.17 26.04 11.93 4.52 diff --git a/egs/reverb/s5/RESULTS b/egs/reverb/s5/RESULTS index 031a6b2ec1a..3537852a827 100644 --- a/egs/reverb/s5/RESULTS +++ b/egs/reverb/s5/RESULTS @@ -1,306 +1,150 @@ -local/summarize_results.pl tri2a -#### RESULTS FOR dt ##### - -exp/tri2a/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 89.00 -RealData_dt_for_1ch_near_room1_A 90.39 -SimData_dt_for_1ch_far_room1_A 22.35 -SimData_dt_for_1ch_far_room2_A 88.37 -SimData_dt_for_1ch_far_room3_A 90.85 -SimData_dt_for_1ch_near_room1_A 12.29 -SimData_dt_for_1ch_near_room2_A 42.86 -SimData_dt_for_1ch_near_room3_A 50.17 -Avg_Sim(6) 51.15 -Avg_Real(2) 89.69 - - -#### RESULTS FOR et ##### - -exp/tri2a/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 51.88 -RealData_dt_for_1ch_near_room1_A 56.14 -SimData_dt_for_1ch_far_room1_A 17.45 -SimData_dt_for_1ch_far_room2_A 44.02 -SimData_dt_for_1ch_far_room3_A 49.90 -SimData_dt_for_1ch_near_room1_A 15.29 -SimData_dt_for_1ch_near_room2_A 22.11 -SimData_dt_for_1ch_near_room3_A 26.34 -Avg_Sim(6) 29.18 -Avg_Real(2) 54.01 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2a_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.95 -RealData_dt_for_1ch_near_room1_A 48.91 -SimData_dt_for_1ch_far_room1_A 16.37 -SimData_dt_for_1ch_far_room2_A 35.67 -SimData_dt_for_1ch_far_room3_A 39.59 -SimData_dt_for_1ch_near_room1_A 13.03 -SimData_dt_for_1ch_near_room2_A 17.08 -SimData_dt_for_1ch_near_room3_A 20.00 +#################### +exp/tri2a/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 89.13 +RealData_dt_for_1ch_near_room1_A 90.27 +SimData_dt_for_1ch_far_room1_A 22.44 +SimData_dt_for_1ch_far_room2_A 88.44 +SimData_dt_for_1ch_far_room3_A 91.27 +SimData_dt_for_1ch_near_room1_A 12.19 +SimData_dt_for_1ch_near_room2_A 42.74 +SimData_dt_for_1ch_near_room3_A 49.31 +Avg_Real(2) 89.70 +Avg_Sim(6) 51.06 + +exp/tri2a/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 88.45 +RealData_et_for_1ch_near_room1_A 88.66 +SimData_et_for_1ch_far_room1_A 22.72 +SimData_et_for_1ch_far_room2_A 81.53 +SimData_et_for_1ch_far_room3_A 89.25 +SimData_et_for_1ch_near_room1_A 14.37 +SimData_et_for_1ch_near_room2_A 40.46 
+SimData_et_for_1ch_near_room3_A 51.50 +Avg_Real(2) 88.56 +Avg_Sim(6) 49.97 + +#################### +exp/tri2a_mc/decode_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 53.38 +RealData_dt_for_1ch_near_room1_A 56.27 +SimData_dt_for_1ch_far_room1_A 16.96 +SimData_dt_for_1ch_far_room2_A 44.15 +SimData_dt_for_1ch_far_room3_A 49.88 +SimData_dt_for_1ch_near_room1_A 15.00 +SimData_dt_for_1ch_near_room2_A 21.81 +SimData_dt_for_1ch_near_room3_A 25.10 +Avg_Real(2) 54.83 +Avg_Sim(6) 28.82 + +exp/tri2a_mc/decode_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 52.94 +RealData_et_for_1ch_near_room1_A 55.35 +SimData_et_for_1ch_far_room1_A 18.91 +SimData_et_for_1ch_far_room2_A 37.33 +SimData_et_for_1ch_far_room3_A 46.69 +SimData_et_for_1ch_near_room1_A 17.77 +SimData_et_for_1ch_near_room2_A 21.23 +SimData_et_for_1ch_near_room3_A 26.17 +Avg_Real(2) 54.14 +Avg_Sim(6) 28.02 + +#################### +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 46.27 +RealData_dt_for_1ch_near_room1_A 48.85 +SimData_dt_for_1ch_far_room1_A 15.59 +SimData_dt_for_1ch_far_room2_A 35.86 +SimData_dt_for_1ch_far_room3_A 39.54 +SimData_dt_for_1ch_near_room1_A 12.78 +SimData_dt_for_1ch_near_room2_A 17.75 +SimData_dt_for_1ch_near_room3_A 20.23 +Avg_Real(2) 47.56 Avg_Sim(6) 23.62 -Avg_Real(2) 46.43 - - -#### RESULTS FOR et ##### - -exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -Avg_Sim(0) 0.00 -Avg_Real(0) 0.00 - - -local/summarize_results.pl tri2b -#### RESULTS FOR dt ##### - -exp/tri2b/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 91.66 -RealData_dt_for_1ch_near_room1_A 91.33 -SimData_dt_for_1ch_far_room1_A 26.94 -SimData_dt_for_1ch_far_room2_A 85.63 -SimData_dt_for_1ch_far_room3_A 91.99 -SimData_dt_for_1ch_near_room1_A 11.95 -SimData_dt_for_1ch_near_room2_A 34.51 -SimData_dt_for_1ch_near_room3_A 44.81 -Avg_Sim(6) 49.30 -Avg_Real(2) 91.50 - - -#### RESULTS FOR et ##### - -exp/tri2b/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 91.29 -RealData_et_for_1ch_near_room1_A 92.05 -SimData_et_for_1ch_far_room1_A 24.16 -SimData_et_for_1ch_far_room2_A 78.57 -SimData_et_for_1ch_far_room3_A 91.01 -SimData_et_for_1ch_near_room1_A 13.76 -SimData_et_for_1ch_near_room2_A 32.94 -SimData_et_for_1ch_near_room3_A 48.24 -Avg_Sim(6) 48.11 -Avg_Real(2) 91.67 - - -local/summarize_results.pl tri2b_mc -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 45.18 -RealData_dt_for_1ch_near_room1_A 49.91 -SimData_dt_for_1ch_far_room1_A 15.78 -SimData_dt_for_1ch_far_room2_A 34.75 -SimData_dt_for_1ch_far_room3_A 37.56 -SimData_dt_for_1ch_near_room1_A 13.45 -SimData_dt_for_1ch_near_room2_A 17.57 -SimData_dt_for_1ch_near_room3_A 19.49 -Avg_Sim(6) 23.10 -Avg_Real(2) 47.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 47.67 -RealData_et_for_1ch_near_room1_A 50.65 -SimData_et_for_1ch_far_room1_A 16.69 -SimData_et_for_1ch_far_room2_A 30.36 -SimData_et_for_1ch_far_room3_A 38.08 -SimData_et_for_1ch_near_room1_A 15.67 -SimData_et_for_1ch_near_room2_A 17.71 -SimData_et_for_1ch_near_room3_A 20.10 -Avg_Sim(6) 23.10 -Avg_Real(2) 49.16 - - -local/summarize_results.pl tri2b_mc basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 39.37 -RealData_dt_for_1ch_near_room1_A 42.48 -SimData_dt_for_1ch_far_room1_A 14.11 -SimData_dt_for_1ch_far_room2_A 28.81 -SimData_dt_for_1ch_far_room3_A 31.53 
-SimData_dt_for_1ch_near_room1_A 11.18 -SimData_dt_for_1ch_near_room2_A 15.01 -SimData_dt_for_1ch_near_room3_A 15.48 -Avg_Sim(6) 19.35 -Avg_Real(2) 40.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 42.03 -RealData_et_for_1ch_near_room1_A 43.53 -SimData_et_for_1ch_far_room1_A 13.87 -SimData_et_for_1ch_far_room2_A 26.02 -SimData_et_for_1ch_far_room3_A 32.80 -SimData_et_for_1ch_near_room1_A 12.42 -SimData_et_for_1ch_near_room2_A 14.82 -SimData_et_for_1ch_near_room3_A 17.02 -Avg_Sim(6) 19.49 -Avg_Real(2) 42.78 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 43.06 -RealData_dt_for_1ch_near_room1_A 46.04 -SimData_dt_for_1ch_far_room1_A 13.59 -SimData_dt_for_1ch_far_room2_A 29.55 -SimData_dt_for_1ch_far_room3_A 32.52 -SimData_dt_for_1ch_near_room1_A 11.21 -SimData_dt_for_1ch_near_room2_A 15.23 -SimData_dt_for_1ch_near_room3_A 16.42 -Avg_Sim(6) 19.75 -Avg_Real(2) 44.55 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 43.45 -RealData_et_for_1ch_near_room1_A 46.89 -SimData_et_for_1ch_far_room1_A 13.37 -SimData_et_for_1ch_far_room2_A 25.96 -SimData_et_for_1ch_far_room3_A 31.73 -SimData_et_for_1ch_near_room1_A 11.89 -SimData_et_for_1ch_near_room2_A 14.64 -SimData_et_for_1ch_near_room3_A 17.26 -Avg_Sim(6) 19.14 -Avg_Real(2) 45.17 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 36.98 -RealData_dt_for_1ch_near_room1_A 39.68 -SimData_dt_for_1ch_far_room1_A 11.43 -SimData_dt_for_1ch_far_room2_A 25.24 -SimData_dt_for_1ch_far_room3_A 27.77 -SimData_dt_for_1ch_near_room1_A 9.19 -SimData_dt_for_1ch_near_room2_A 12.77 -SimData_dt_for_1ch_near_room3_A 13.30 -Avg_Sim(6) 16.62 -Avg_Real(2) 38.33 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_bg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 38.93 -RealData_et_for_1ch_near_room1_A 39.51 -SimData_et_for_1ch_far_room1_A 11.32 -SimData_et_for_1ch_far_room2_A 22.31 -SimData_et_for_1ch_far_room3_A 28.40 -SimData_et_for_1ch_near_room1_A 9.69 -SimData_et_for_1ch_near_room2_A 12.36 -SimData_et_for_1ch_near_room3_A 14.77 -Avg_Sim(6) 16.47 -Avg_Real(2) 39.22 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 basis_fmllr -#### RESULTS FOR dt ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 31.58 -RealData_dt_for_1ch_near_room1_A 32.00 -SimData_dt_for_1ch_far_room1_A 8.51 -SimData_dt_for_1ch_far_room2_A 18.36 -SimData_dt_for_1ch_far_room3_A 20.40 -SimData_dt_for_1ch_near_room1_A 6.47 -SimData_dt_for_1ch_near_room2_A 9.61 -SimData_dt_for_1ch_near_room3_A 9.59 -Avg_Sim(6) 12.16 -Avg_Real(2) 31.79 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 30.32 -RealData_et_for_1ch_near_room1_A 32.45 -SimData_et_for_1ch_far_room1_A 7.74 -SimData_et_for_1ch_far_room2_A 17.01 -SimData_et_for_1ch_far_room3_A 21.05 -SimData_et_for_1ch_near_room1_A 7.01 -SimData_et_for_1ch_near_room2_A 9.52 -SimData_et_for_1ch_near_room3_A 11.29 -Avg_Sim(6) 12.27 -Avg_Real(2) 31.39 - - -local/summarize_results.pl tri2b_mc_mmi_b0.1 mbr_basis_fmllr -#### RESULTS FOR dt ##### - 
-exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_dt* -LMW = 15 -RealData_dt_for_1ch_far_room1_A 30.96 -RealData_dt_for_1ch_near_room1_A 30.88 -SimData_dt_for_1ch_far_room1_A 8.33 -SimData_dt_for_1ch_far_room2_A 18.14 -SimData_dt_for_1ch_far_room3_A 20.15 -SimData_dt_for_1ch_near_room1_A 6.24 -SimData_dt_for_1ch_near_room2_A 9.47 -SimData_dt_for_1ch_near_room3_A 9.62 -Avg_Sim(6) 11.99 -Avg_Real(2) 30.92 - - -#### RESULTS FOR et ##### - -exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_et* -LMW = 15 -RealData_et_for_1ch_far_room1_A 29.37 -RealData_et_for_1ch_near_room1_A 31.84 -SimData_et_for_1ch_far_room1_A 7.64 -SimData_et_for_1ch_far_room2_A 16.86 -SimData_et_for_1ch_far_room3_A 20.59 -SimData_et_for_1ch_near_room1_A 6.93 -SimData_et_for_1ch_near_room2_A 9.48 -SimData_et_for_1ch_near_room3_A 11.19 -Avg_Sim(6) 12.11 -Avg_Real(2) 30.61 +exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 48.11 +RealData_et_for_1ch_near_room1_A 48.42 +SimData_et_for_1ch_far_room1_A 16.57 +SimData_et_for_1ch_far_room2_A 31.54 +SimData_et_for_1ch_far_room3_A 39.32 +SimData_et_for_1ch_near_room1_A 14.31 +SimData_et_for_1ch_near_room2_A 18.42 +SimData_et_for_1ch_near_room3_A 21.03 +Avg_Real(2) 48.27 +Avg_Sim(6) 23.53 + +#################### +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 34.04 +RealData_dt_for_1ch_near_room1_A 33.37 +SimData_dt_for_1ch_far_room1_A 10.57 +SimData_dt_for_1ch_far_room2_A 22.63 +SimData_dt_for_1ch_far_room3_A 25.00 +SimData_dt_for_1ch_near_room1_A 7.57 +SimData_dt_for_1ch_near_room2_A 10.97 +SimData_dt_for_1ch_near_room3_A 12.59 +Avg_Real(2) 33.70 +Avg_Sim(6) 14.89 + +exp/tri2b_mc/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 33.49 +RealData_et_for_1ch_near_room1_A 34.72 +SimData_et_for_1ch_far_room1_A 10.03 +SimData_et_for_1ch_far_room2_A 20.16 +SimData_et_for_1ch_far_room3_A 25.08 +SimData_et_for_1ch_near_room1_A 8.45 +SimData_et_for_1ch_near_room2_A 11.16 +SimData_et_for_1ch_near_room3_A 12.88 +Avg_Real(2) 34.11 +Avg_Sim(6) 14.63 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 31.17 +RealData_dt_for_1ch_near_room1_A 31.82 +SimData_dt_for_1ch_far_room1_A 8.53 +SimData_dt_for_1ch_far_room2_A 17.43 +SimData_dt_for_1ch_far_room3_A 21.04 +SimData_dt_for_1ch_near_room1_A 6.78 +SimData_dt_for_1ch_near_room2_A 8.97 +SimData_dt_for_1ch_near_room3_A 10.01 +Avg_Real(2) 31.50 +Avg_Sim(6) 12.13 + +exp/tri2b_mc_mmi_b0.1/decode_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 31.20 +RealData_et_for_1ch_near_room1_A 30.98 +SimData_et_for_1ch_far_room1_A 8.42 +SimData_et_for_1ch_far_room2_A 17.63 +SimData_et_for_1ch_far_room3_A 20.71 +SimData_et_for_1ch_near_room1_A 7.03 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 11.11 +Avg_Real(2) 31.09 +Avg_Sim(6) 12.40 + +#################### +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*dt* +RealData_dt_for_1ch_far_room1_A 30.42 +RealData_dt_for_1ch_near_room1_A 31.50 +SimData_dt_for_1ch_far_room1_A 8.24 +SimData_dt_for_1ch_far_room2_A 17.25 +SimData_dt_for_1ch_far_room3_A 20.72 +SimData_dt_for_1ch_near_room1_A 6.76 +SimData_dt_for_1ch_near_room2_A 8.87 +SimData_dt_for_1ch_near_room3_A 9.92 +Avg_Real(2) 30.96 +Avg_Sim(6) 11.96 + +exp/tri2b_mc_mmi_b0.1/decode_mbr_basis_fmllr_tg_5k_REVERB_*et* +RealData_et_for_1ch_far_room1_A 30.89 +RealData_et_for_1ch_near_room1_A 31.01 +SimData_et_for_1ch_far_room1_A 8.20 
+SimData_et_for_1ch_far_room2_A 17.34 +SimData_et_for_1ch_far_room3_A 20.56 +SimData_et_for_1ch_near_room1_A 6.91 +SimData_et_for_1ch_near_room2_A 9.50 +SimData_et_for_1ch_near_room3_A 10.93 +Avg_Real(2) 30.95 +Avg_Sim(6) 12.24 diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/reverb/s5/corpus.sh b/egs/reverb/s5/corpus.sh deleted file mode 100644 index 32a2ee4b85b..00000000000 --- a/egs/reverb/s5/corpus.sh +++ /dev/null @@ -1,17 +0,0 @@ -if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then - REVERB_home=/export/corpora5/REVERB_2014/REVERB - export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 - # set LDC WSJ0 directory to obtain LMs - # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) - export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B - # It is assumed that there will be a 'wsj0' subdirectory - # within the top-level corpus directory -else - echo "Set the data directory locations." 
&& exit 1; -fi - -export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt -export reverb_et=$REVERB_home/REVERB_WSJCAM0_et -export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev -export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval - diff --git a/egs/reverb/s5/local/Generate_mcTrainData_cut.m b/egs/reverb/s5/local/Generate_mcTrainData_cut.m old mode 100644 new mode 100755 diff --git a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh index c3de2ba7fd3..a4599f97702 100755 --- a/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_mcwsjav_data_prep.sh @@ -65,8 +65,8 @@ if [ ! -z "$3" ]; then dt_or_x=$3 fi -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch @@ -97,11 +97,11 @@ s/\x0D$//' \ # e.g. yield' --> yield # reason: YIELD' is not in dict, while YIELD is s/YIELD'/YIELD/g - s/'ROOTS'/ROOTS/g - s/'WHERE/WHERE/g + s/'ROOTS'/ROOTS/g + s/'WHERE/WHERE/g s/PEOPLE'/PEOPLE/g s/SIT'/SIT/g - s/'DOMINEE/DOMINEE/g + s/'DOMINEE/DOMINEE/g s/CHURCH'/CHURCH/g" \ -e ' # fix the single missing double full stop issue at the end of an utterance @@ -110,9 +110,9 @@ s/\x0D$//' \ /^[A-Z]$/ { # append a line N - # search for single dot on the second line + # search for single dot on the second line /\n\./ { - # found it - now replace the + # found it - now replace the s/\([A-Z]\)\n\./\1\.\n\./ } }' \ @@ -156,9 +156,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh index 2c169e84b59..6ab2f2f4b73 100755 --- a/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh +++ b/egs/reverb/s5/local/REVERB_wsjcam0_data_prep.sh @@ -50,8 +50,8 @@ fi cd $dir MIC=primary -# unfortunately, we need a pointer to HTK baseline -# since the corpus does NOT contain the data set descriptions +# unfortunately, we need a pointer to HTK baseline +# since the corpus does NOT contain the data set descriptions # for the REVERB Challenge taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch #taskFiles=`ls $taskFileDir/*Data_dt_for_*` @@ -108,9 +108,9 @@ echo "Data preparation for $set succeeded" mfccdir=mfcc/$dataset -#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do -#for x in si_tr; do -steps/make_mfcc.sh --nj 10 \ +#for x in test_eval92_clean test_eval92_5k_clean dev_dt_05_clean dev_dt_20_clean train_si84_clean; do +#for x in si_tr; do +steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \ data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$dataset/$set exp/make_mfcc/$dataset/$set $mfccdir || exit 1; diff --git a/egs/reverb/s5/local/calc_wer.sh 
b/egs/reverb/s5/local/calc_wer.sh new file mode 100755 index 00000000000..c4b5eeb87f3 --- /dev/null +++ b/egs/reverb/s5/local/calc_wer.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2016 MERL (author: Shinji Watanabe) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +. ./cmd.sh +. ./path.sh + +lmw=15 +am="tri2a" +lm="bg_5k" +decode="" + +. utils/parse_options.sh + +if [ ! -z $decode ]; then + decode="_$decode" +fi + +dir="exp/$am/decode${decode}_${lm}_REVERB_" +echo "####################" +echo "${dir}*dt*" +for a in `echo ${dir}*dt* | tr " " "\n" | grep -v "A\.si"`; do + echo $a | awk -F '_' '{for(i=NF-6;i -1) { - if ($ARGV[0] =~ /^--lmw=(\d+)$/) - { - $opt_lmw = $1 + 0; - shift @ARGV; - } - elsif ($ARGV[0] =~ /^--lm=(\w+)$/) { - $lm = $1; - shift @ARGV; - } - else { - last; - } -} - - -print "$0 @ARGV\n"; - -my $system = "tri2b_mc"; -if ($ARGV[0] ne "") { $system = $ARGV[0]; } - -for my $dt_or_et ("dt", "et") { - -print "#### RESULTS FOR $dt_or_et ##### \n\n"; - -my $pref = "REVERB_$dt_or_et"; -#if ($lm ne "bg_5k") { -$pref = "${lm}_$pref"; -#} -if ($ARGV[1] ne "") { $pref = $ARGV[1] . '_' . $pref; } -if ($ARGV[2] ne "") { $pref = $pref . '_' . $ARGV[2]; } - -my $suff = ""; - -print "exp/$system/decode_$suff$pref*\n"; -my @folders = glob("exp/$system/decode_$suff$pref*"); - -my ($min_lmw, $max_lmw) = (9, 20); -@folders = grep { -f "$_/wer_$min_lmw" } @folders; -my @sum_wer; -my %wer; -my %avg_wer_disp; -my $nc = 0; -my $ns = 0; -my $nr = 0; -for my $lmw ($min_lmw..$max_lmw) -{ - for my $fold (@folders) { - my $res_file = "$fold/wer_$lmw"; - #print "fold = $fold pref = $pref\n"; - #my ($cond) = $fold =~ /decode_(\w+)$/; - my ($cond) = $fold =~ /decode_\Q$suff\E\Q${pref}\E_(\w+)$/; - if ($cond =~ /^Sim.+(far|near|cln)|^Real/) { - open(RES, $res_file) or die "$res_file: $_"; - while () { - if (/%WER\s+(\S+)/) { - my $wer = $1; - #print "cond = $cond lmw = $lmw wer = $1\n"; - if ($cond !~ /cln/) { - $sum_wer[$lmw] += $wer; - } - $wer{$cond}[$lmw] = $wer; - } - } - #print "cond = $cond fold = $fold\n"; - } - } -} - -if (!$opt_lmw && $dt_or_et eq "dt") { - $opt_lmw = $min_lmw; - for my $lmw ($min_lmw+1..$max_lmw) { - if ($sum_wer[$lmw] < $sum_wer[$opt_lmw]) { - $opt_lmw = $lmw; - } - } -} - -print "LMW = $opt_lmw\n"; -for my $cond (sort keys %wer) { - print "$cond\t$wer{$cond}[$opt_lmw]\n"; - if ($cond =~ /SimData_[de]t/) { - if ($cond !~ /cln/) { - $avg_wer_disp{"SimData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"SimData"}) / ++$ns; - } - else { - $avg_wer_disp{"CleanData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"CleanData"}) / ++$nc; - } - } - elsif ($cond =~ /RealData_[de]t/) { - $avg_wer_disp{"RealData"} += ($wer{$cond}[$opt_lmw] - $avg_wer_disp{"RealData"}) / ++$nr; - } -} - -#print "Avg_Clean($nc)\t", sprintf("%.2f", $avg_wer_disp{"CleanData"}), "\n"; -print "Avg_Sim($ns)\t", sprintf("%.2f", $avg_wer_disp{"SimData"}), "\n"; -print "Avg_Real($nr)\t", sprintf("%.2f", $avg_wer_disp{"RealData"}), "\n"; -print "\n\n"; - -} diff 
--git a/egs/reverb/s5/local/wsjcam0_format_data.sh b/egs/reverb/s5/local/wsjcam0_format_data.sh
index aa1e8224fc9..883cb20ed0e 100755
--- a/egs/reverb/s5/local/wsjcam0_format_data.sh
+++ b/egs/reverb/s5/local/wsjcam0_format_data.sh
@@ -50,22 +50,8 @@ for lm_suffix in bg_5k tg_5k; do
    cp -r data/lang/$f $test
  done
  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt
-
-  # grep -v '<s> <s>' because the LM seems to have some strange and useless
-  # stuff in it with multiple <s>'s in the history. Encountered some other similar
-  # things in a LM from Geoff. Removing all "illegal" combinations of <s> and </s>,
-  # which are supposed to occur only at being/end of utt. These can cause
-  # determinization failures of CLG [ends up being epsilon cycles].
-  gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \
-    grep -v '<s> <s>' | \
-    grep -v '</s> <s>' | \
-    grep -v '</s> </s>' | \
-    arpa2fst - | fstprint | \
-    utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
-    utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
-      --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
-    fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
+    arpa2fst --disambig-symbol=#0 \
+      --read-symbol-table=$test/words.txt - $test/G.fst
  fstisstochastic $test/G.fst
  # The output is like:
  # 9.14233e-05 -0.259833
@@ -83,7 +69,7 @@ for lm_suffix in bg_5k tg_5k; do
    < "$lexicon" >$tmpdir/g/select_empty.fst.txt
  fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \
    fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst
-  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
+  fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' &&
    echo "Language model has cycles with empty words" && exit 1
  rm -r $tmpdir/g
done
diff --git a/egs/reverb/s5/path.sh b/egs/reverb/s5/path.sh
index eea6b7a8293..1a6fb5f891b 100644
--- a/egs/reverb/s5/path.sh
+++ b/egs/reverb/s5/path.sh
@@ -1,3 +1,5 @@
export KALDI_ROOT=`pwd`/../../..
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C
diff --git a/egs/reverb/s5/run.sh b/egs/reverb/s5/run.sh
index 0e3eac6e6c1..ffb0b20422d 100755
--- a/egs/reverb/s5/run.sh
+++ b/egs/reverb/s5/run.sh
@@ -15,89 +15,92 @@
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
+# This is a shell script, but it's recommended that you run the commands one by
+# one by copying and pasting into the shell.
+# Caution: some of the graph creation steps use quite a bit of memory, so you
+# should run this on a machine that has sufficient memory.
+
# Requirements) matlab and tcsh
if [ ! `which tcsh` ]; then
-  echo "Install tcsh, which is used in some REVERB scripts"
-  exit 1
+  echo "Install tcsh, which is used in some REVERB scripts"
+  exit 1
fi
if [ !
`which matlab` ]; then - echo "Install matlab, which is used to generate multi-condition data" - exit 1 + echo "Install matlab, which is used to generate multi-condition data" + exit 1 fi -if [ ! -e path.sh ] || [ ! -e corpus.sh ]; then - echo "ERROR: path.sh and/or corpus.sh not found" - echo "You need to create these from {path,corpus}.sh.default to match your system" - echo "Make sure you follow the instructions in ../README.txt" - exit 1 +. ./cmd.sh +. ./path.sh + +stage=1 +. utils/parse_options.sh +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', +set -euxo pipefail + +# please make sure to set the paths of the REVERB and WSJ0 data +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + REVERB_home=/export/corpora5/REVERB_2014/REVERB + export wsjcam0=/export/corpora3/LDC/LDC95S24/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/export/corpora5/LDC/LDC93S6A/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +elif [[ $(hostname -f) == *.merl.com ]] ; then + REVERB_home=/db/laputa1/data/original/public/REVERB + export wsjcam0=$REVERB_home/wsjcam0 + # set LDC WSJ0 directory to obtain LMs + # REVERB data directory only provides bi-gram (bcb05cnp), but this recipe also uses 3-gram (tcb05cnp.z) + export wsj0=/db/laputa1/data/original/public/WSJ0/11-13.1 #LDC93S6A or LDC93S6B + # It is assumed that there will be a 'wsj0' subdirectory + # within the top-level corpus directory +else + echo "Set the data directory locations." && exit 1; fi +export reverb_dt=$REVERB_home/REVERB_WSJCAM0_dt +export reverb_et=$REVERB_home/REVERB_WSJCAM0_et +export reverb_real_dt=$REVERB_home/MC_WSJ_AV_Dev +export reverb_real_et=$REVERB_home/MC_WSJ_AV_Eval -. ./cmd.sh - -# please make sure to set the paths of the REVERB and WSJ0 data -. ./corpus.sh - -# set the directory of the multi-condition training data generated +# set the directory of the multi-condition training data to be generated reverb_tr=`pwd`/data_tr_cut/REVERB_WSJCAM0_tr_cut # LDA context size (left/right) (4 is default) context_size=4 -# The language models with which to decode (tg_5k or bg_5k or "tg_5k bg_5k" for -# both) -lms="bg_5k tg_5k" +# The language models with which to decode (tg_5k or bg_5k) +lm="tg_5k" # number of jobs for feature extraction and model training nj_train=30 # number of jobs for decoding -# use less jobs for trigram model -# if you have enough RAM (~ 32 GB), you can use 8 jobs for trigram as well -nj_bg=8 -nj_tg=8 -nj_bg=25 ## -nj_tg=25 ## - -# set to true if running from scratch -do_prep=true +nj_decode=8 # set to true if you want the tri2a systems (re-implementation of the HTK baselines) do_tri2a=true - -# The following are the settings determined by Gaussian Process optimization. -# However, they are not used in the final system. -# You can use the code below for training the "tri2c_mc" system. - -# LDA parameters for MCT recognizer. -# Use significantly more context than the default (7 frames ~ 85 ms) -mct_lda_left_context=7 -mct_lda_right_context=5 - -# Number of states and Gaussians for the MCT recognizer. -mct_nstates=7500 -mct_ngauss=45000 - -## End of GP tuned settings - -false && { -if $do_prep; then +if [ $stage -le 1 ]; then # Generate multi-condition training data # Note that utterance lengths match the original set. 
# This enables using clean alignments in multi-condition training (stereo training) - #local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr + local/REVERB_create_mcdata.sh $wsjcam0 $reverb_tr +fi +if [ $stage -le 2 ]; then # Prepare wsjcam0 clean data and wsj0 language model. - local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 || exit 1 + local/wsjcam0_data_prep.sh $wsjcam0 $wsj0 # Prepare merged BEEP/CMU dictionary. - local/wsj_prepare_beep_dict.sh || exit 1; + local/wsj_prepare_beep_dict.sh # Prepare wordlists, etc. - utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang # Prepare directory structure for clean data. Apply some language model fixes. - local/wsjcam0_format_data.sh || exit 1; + local/wsjcam0_format_data.sh # Now it's getting more interesting. # Prepare the multi-condition training data and the REVERB dt set. @@ -108,253 +111,227 @@ if $do_prep; then # local/REVERB_wsjcam0_data_prep.sh /path/to/processed/REVERB_WSJCAM0_dt processed_REVERB_dt dt # The first argument is supposed to point to a folder that has the same structure # as the REVERB corpus. - local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt || exit 1; - local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et || exit 1; + local/REVERB_wsjcam0_data_prep.sh $reverb_tr REVERB_tr_cut tr + local/REVERB_wsjcam0_data_prep.sh $reverb_dt REVERB_dt dt + local/REVERB_wsjcam0_data_prep.sh $reverb_et REVERB_et et # Prepare the REVERB "real" dt set from MCWSJAV corpus. # This corpus is *never* used for training. # This creates the data set called REVERB_Real_dt and its subfolders - local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_dt REVERB_Real_dt dt # The MLF file exists only once in the corpus, namely in the real_dt directory # so we pass it as 4th argument - local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf || exit 1; + local/REVERB_mcwsjav_data_prep.sh $reverb_real_et REVERB_Real_et et $reverb_real_dt/mlf/WSJ.mlf +fi +if [ $stage -le 3 ]; then # Extract MFCC features for clean sets. # For the non-clean data sets, this is outsourced to the data preparation scripts. mfccdir=mfcc ### for x in si_tr si_dt; do it seems that the number of transcriptions of si_dt is not correct. - for x in si_tr; do - steps/make_mfcc.sh --nj $nj_train \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + for x in si_tr; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj_train \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done fi -# Train monophone model on clean data (si_tr). -if [ ! -e exp/mono0a/final.mdl ]; then - echo "### TRAINING mono0a ###" - steps/train_mono.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a || exit 1; +if [ $stage -le 4 ]; then + # Train monophone model on clean data (si_tr). + echo "### TRAINING mono0a ###" + steps/train_mono.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a + + # Align monophones with clean data. + echo "### ALIGNING mono0a_ali ###" + steps/align_si.sh --boost-silence 1.25 --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/mono0a exp/mono0a_ali + + # Create first triphone recognizer. 
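  # (In steps/train_deltas.sh the two numeric arguments below are the target
  # number of decision-tree leaves and the total number of Gaussians, here
  # 2000 and 10000.) An optional sanity check of the finished model, as a sketch:
  #   gmm-info exp/tri1/final.mdl   # prints the number of pdfs and Gaussians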
+ echo "### TRAINING tri1 ###" + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 + + echo "### ALIGNING tri1_ali ###" + # Re-align triphones. + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri1 exp/tri1_ali fi -# Align monophones with clean data. -if [ ! -e exp/mono0a_ali/ali.1.gz ]; then - echo "### ALIGNING mono0a_ali ###" - steps/align_si.sh --boost-silence 1.25 --nj $nj_train \ - data/si_tr data/lang exp/mono0a exp/mono0a_ali || exit 1; -fi - -# Create first triphone recognizer. -if [ ! -e exp/tri1/final.mdl ]; then - echo "### TRAINING tri1 ###" - steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/si_tr data/lang exp/mono0a_ali exp/tri1 || exit 1; -fi - -# Prepare first triphone recognizer and decode clean si_dt for verification. -#utils/mkgraph.sh data/lang_test_bg_5k exp/tri1 exp/tri1/graph_bg_5k || exit 1; -#steps/decode.sh --nj 8 exp/tri1/graph_bg_5k data/si_dt exp/tri1/decode_si_dt - -if [ ! -e exp/tri1_ali/ali.1.gz ]; then - echo "### ALIGNING tri1_ali ###" - # Re-align triphones. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri1 exp/tri1_ali || exit 1; -fi - - # The following code trains and evaluates a delta feature recognizer, which is similar to the HTK # baseline (but using per-utterance basis fMLLR instead of batch MLLR). This is for reference only. if $do_tri2a; then +if [ $stage -le 5 ]; then # Train tri2a, which is deltas + delta-deltas, on clean data. - steps/train_deltas.sh \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2a # Re-align triphones using clean data. This gives a smallish performance gain. - steps/align_si.sh --nj $nj_train \ - data/si_tr data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + data/si_tr data/lang exp/tri2a exp/tri2a_ali # Train a multi-condition triphone recognizer. # This uses alignments on *clean* data, which is allowed for REVERB. - # However, we have to use the "cut" version so that the length of the + # However, we have to use the "cut" version so that the length of the # waveforms match. # It is actually asserted by the Challenge that clean and multi-condition waves are aligned. - steps/train_deltas.sh \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_ali exp/tri2a_mc # Prepare clean and mc tri2a models for decoding. 
- utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k - utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg_5k & + utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a_mc exp/tri2a_mc/graph_bg_5k & + wait +fi +if [ $stage -le 6 ]; then # decode REVERB dt using tri2a, clean - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a/graph_bg_5k $dataset exp/tri2a/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done # decode REVERB dt using tri2a, mc - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done + # basis fMLLR for tri2a_mc system # This computes a transform for every training utterance and computes a basis from that. - steps/get_fmllr_basis.sh --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc || exit 1; + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri2a_mc # Recognition using fMLLR adaptation (per-utterance processing). - for dataset in data/REVERB_dt/SimData_dt* data/REVERB_Real_dt/RealData_dt*; do - steps/decode_basis_fmllr.sh --nj $nj_bg \ - exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_REVERB_dt_`basename $dataset` || exit 1; + for dataset in data/REVERB_*{dt,et}/*; do + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + exp/tri2a_mc/graph_bg_5k $dataset exp/tri2a_mc/decode_basis_fmllr_bg_5k_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` & done - -fi # train tri2a, tri2a_mc - - -# Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. -if [ ! -e exp/tri2b/final.mdl ]; then - echo "### TRAINING tri2b ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b || exit 1; + wait fi - -# tri2b (LDA-MLLT system) with multi-condition training, using default parameters. -if [ ! -e exp/tri2b_mc/final.mdl ]; then - echo "### TRAINING tri2b_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$context_size --right-context=$context_size" \ - 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc || exit 1; fi - -# tri2c (LDA-MLLT system) with multi-condition training, optimized parameters. -# Disabled by default -- it only improves slightly, and tends to overfit. -if [ ! -e exp/tri2c_mc/final.mdl ]; then - echo "### TRAINING tri2c_mc ###" - steps/train_lda_mllt.sh \ - --splice-opts "--left-context=$mct_lda_left_context --right-context=$mct_lda_right_context" \ - $mct_nstates $mct_ngauss data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2c_mc || exit 1; +if [ $stage -le 7 ]; then + # Train tri2b recognizer, which uses LDA-MLLT, using the default parameters from the WSJ recipe. 
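  # With --splice-opts "--left-context=$context_size --right-context=$context_size"
  # (context_size=4 above), each frame is spliced together with 4 neighbours on
  # either side before the LDA+MLLT transform is estimated; assuming the default
  # 13-dim MFCCs and the script's default --dim 40, that is a 9 x 13 = 117 dim
  # input projected down to 40 dims. A hypothetical check of the feature dimension:
  #   feat-to-dim scp:data/si_tr/feats.scp -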
+ echo "### TRAINING tri2b ###" + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/si_tr data/lang exp/tri1_ali exp/tri2b + + # tri2b (LDA-MLLT system) with multi-condition training, using default parameters. + echo "### TRAINING tri2b_mc ###" + steps/train_lda_mllt.sh --cmd "$train_cmd"\ + --splice-opts "--left-context=$context_size --right-context=$context_size" \ + 2500 15000 data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/tri1_ali exp/tri2b_mc fi - # Prepare tri2b* systems for decoding. -for recog in tri2b tri2b_mc; do - for lm in $lms; do - graph=exp/$recog/graph_$lm - if [ ! -e "$graph" ]; then - echo "### MAKING GRAPH $graph ###" - utils/mkgraph.sh data/lang_test_$lm exp/$recog $graph || exit 1; - fi - done -done - +if [ $stage -le 8 ]; then + echo "### MAKING GRAPH {tri2b,tri2b_mc}/graph_$lm ###" + for recog in tri2b tri2b_mc; do + utils/mkgraph.sh data/lang_test_$lm exp/$recog exp/$recog/graph_$lm & + done + wait +fi # discriminative training on top of multi-condition systems # one could also add tri2b here to have a DT clean recognizer for reference -for base_recog in tri2b_mc; do - - bmmi_recog=${base_recog}_mmi_b0.1 - echo "### DT $base_recog --> $bmmi_recog ###" +if [ $stage -le 9 ]; then + base_recog=tri2b_mc + bmmi_recog=${base_recog}_mmi_b0.1 + echo "### DT $base_recog --> $bmmi_recog ###" + + # get alignments from base recognizer + steps/align_si.sh --nj $nj_train --cmd "$train_cmd" \ + --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali + + # get lattices from base recognizer + denlats_dir=${base_recog}_denlats + subsplit=`echo $nj_train \* 2 | bc` + # DT with multi-condition data ... + steps/make_denlats.sh --sub-split $subsplit --nj $nj_train --cmd "$decode_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir + + # boosted MMI training + steps/train_mmi.sh --boost 0.1 --cmd "$train_cmd" \ + data/REVERB_tr_cut/SimData_tr_for_1ch_A \ + data/lang \ + exp/${base_recog}_ali \ + exp/$denlats_dir \ + exp/$bmmi_recog + cp exp/$base_recog/ali.* exp/$bmmi_recog +fi - # get alignments from base recognizer - if [ ! -e exp/${base_recog}_ali/ali.1.gz ]; then - steps/align_si.sh --nj $nj_train \ - --use-graphs true data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/${base_recog}_ali || exit 1; - fi +# decoding using various recognizers +if [ $stage -le 10 ]; then + # put tri2b last since it takes longest due to the large mismatch. 
+ for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + echo "### DECODING with $recog, noadapt, $lm ###" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_$decode_suff & + done + wait + + echo " ## MBR RESCORING with $recog, noadapt ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_$decode_suff + cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff & + done + wait - # get lattices from base recognizer - denlats_dir=${base_recog}_denlats - subsplit=`echo $nj_train \* 2 | bc` - if [ ! -e exp/$denlats_dir/.done.1 ]; then - # DT with multi-condition data ... - steps/make_denlats.sh --sub-split $subsplit --nj $nj_train \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A data/lang exp/$base_recog exp/$denlats_dir || exit 1; - fi + done # loop recog +fi - # boosted MMI training - if [ ! -e exp/$bmmi_recog/final.mdl ]; then - steps/train_mmi.sh --boost 0.1 \ - data/REVERB_tr_cut/SimData_tr_for_1ch_A \ - data/lang \ - exp/${base_recog}_ali \ - exp/$denlats_dir \ - exp/$bmmi_recog || exit 1; - cp exp/$base_recog/ali.* exp/$bmmi_recog +# decoding using various recognizers with adaptation +if [ $stage -le 11 ]; then + # put tri2b last since it takes longest due to the large mismatch. + for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do + # The graph from the ML directory is used in recipe + recog2=`echo $recog | sed s/_mmi.*//` + graph=exp/$recog2/graph_$lm + + # set the adaptation data + if [[ "$recog" =~ _mc ]]; then + tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A + else + tr_dataset=si_tr fi -done - -} + echo "### DECODING with $recog, basis_fmllr, $lm ###" + steps/get_fmllr_basis.sh --cmd "$train_cmd" --per-utt true data/$tr_dataset data/lang exp/$recog + for dataset in data/REVERB_*{dt,et}/*; do + ( + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + steps/decode_basis_fmllr.sh --nj $nj_decode --cmd "$decode_cmd" \ + $graph $dataset \ + exp/$recog/decode_basis_fmllr_$decode_suff + ) & + done + wait + + echo " ## MBR RESCORING with $recog, basis_fmllr ##" + for dataset in data/REVERB_*{dt,et}/*; do + decode_suff=${lm}_`echo $dataset | awk -F '/' '{print $2 "_" $3}'` + mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff + cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff + local/score_mbr.sh --cmd "$decode_cmd" \ + $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff & + done + wait -# decoding using bigram / trigram and various recognizers -do_adapt=true -for lm in $lms; do - if [[ "$lm" =~ tg ]]; then - nj=$nj_tg - else - nj=$nj_bg - fi - # put tri2b last since it takes longest due to the large mismatch. 
- for recog in tri2b_mc tri2b_mc_mmi_b0.1 tri2b; do - # The graph from the ML directory is used in recipe - recog2=`echo $recog | sed s/_mmi.*//` - graph=exp/$recog2/graph_$lm - for dataset in data/REVERB_dt/SimData_dt* \ - data/REVERB_et/SimData_et* \ - data/REVERB_Real_dt/RealData_dt* \ - data/REVERB_Real_et/RealData_et*; do - if [[ $dataset =~ _dt ]]; then - pdataset=REVERB_dt - elif [[ $dataset =~ _et ]]; then - pdataset=REVERB_et - else - echo "$0: Cannot figure out what to do with: $dataset" - exit 1 - fi - #pdataset=$(basename $(dirname $dataset)) - #echo $pdataset - decode_suff=${lm}_${pdataset}_`basename $dataset` - if [ ! -e exp/$recog/decode_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, noadapt, $lm ###" - steps/decode.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_$decode_suff - cp exp/$recog/decode_$decode_suff/lat.*.gz exp/$recog/decode_mbr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, noadapt ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_$decode_suff || exit 1 - fi - if $do_adapt; then - if [ ! -e exp/$recog/fmllr.basis ]; then - if [[ "$recog" =~ _mc ]]; then - tr_dataset=REVERB_tr_cut/SimData_tr_for_1ch_A - else - tr_dataset=si_tr - fi - steps/get_fmllr_basis.sh --per-utt true data/$tr_dataset data/lang exp/$recog || exit 1; - fi - if [ ! -e exp/$recog/decode_basis_fmllr_$decode_suff/wer_15 ]; then - echo "### DECODING $dataset | $recog, basis_fmllr, $lm ###" - steps/decode_basis_fmllr.sh --nj $nj \ - $graph $dataset \ - exp/$recog/decode_basis_fmllr_$decode_suff || exit 1; - fi - if [ ! -e exp/$recog/decode_mbr_basis_fmllr_$decode_suff/wer_15 ]; then - mkdir -p exp/$recog/decode_mbr_basis_fmllr_$decode_suff - cp exp/$recog/decode_basis_fmllr_$decode_suff/lat.*.gz exp/$recog/decode_mbr_basis_fmllr_$decode_suff - echo " ## MBR RESCORING $dataset | $recog, basis_fmllr ##" - local/score_mbr.sh \ - $dataset data/lang_test_$lm/ exp/$recog/decode_mbr_basis_fmllr_$decode_suff || exit 1 - fi - fi - - done # loop data set - done # loop recog -done # loop LM + done # loop recog +fi # get all WERs with lmw=15 -local/get_results.sh +if [ $stage -le 12 ]; then + local/get_results.sh +fi diff --git a/egs/rm/s5/RESULTS b/egs/rm/s5/RESULTS index 11587e765c7..1014fce03ed 100644 --- a/egs/rm/s5/RESULTS +++ b/egs/rm/s5/RESULTS @@ -1,5 +1,5 @@ -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done - +#!/bin/bash +for x in exp/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done exit 0 # Monophone, MFCC+delta+accel @@ -163,7 +163,6 @@ exit 0 %WER 7.73 [ 969 / 12533, 74 ins, 157 del, 738 sub ] exp/nnet5e_mpe_gpu/decode_ug_epoch4/wer_9 - # Some system combination experiments. 
%WER 3.18 [ 398 / 12533, 60 ins, 75 del, 263 sub ] exp/combine_1_2a/decode/wer_4 %WER 1.56 [ 196 / 12533, 27 ins, 32 del, 137 sub ] exp/combine_sgmm2_4a_3b/decode/wer_2 @@ -230,34 +229,61 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/ %WER 7.33 [ 919 / 12533, 80 ins, 153 del, 686 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch3/wer_13 %WER 7.36 [ 923 / 12533, 85 ins, 148 del, 690 sub ] exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_ug_epoch4/wer_13 +### chain results ### +# current best chain result with TDNN (check local/chain/run_tdnn_5f.sh) +%WER 2.94 [ 369 / 12533, 51 ins, 71 del, 247 sub ] exp/chain/tdnn_5f/decode/wer_3_0.5 ### nnet1 results ### -# DNN systems (Karel - 25.9.2014) -# Per-frame cross-entropy training -%WER 1.63 [ 204 / 12533, 32 ins, 42 del, 130 sub ] exp/dnn4b_pretrain-dbn_dnn/decode/wer_3 -%WER 7.77 [ 974 / 12533, 81 ins, 158 del, 735 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_ug/wer_7 -# Sequence-based sMBR training -%WER 1.61 [ 202 / 12533, 32 ins, 42 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it1/wer_3 -%WER 1.62 [ 203 / 12533, 33 ins, 42 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it2/wer_3 -%WER 1.63 [ 204 / 12533, 32 ins, 42 del, 130 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it3/wer_3 -%WER 1.64 [ 206 / 12533, 32 ins, 42 del, 132 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it4/wer_3 -%WER 1.63 [ 204 / 12533, 32 ins, 41 del, 131 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it5/wer_3 -%WER 1.64 [ 206 / 12533, 20 ins, 58 del, 128 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it6/wer_5 - -# CNN systems (Karel - 25.9.2014) -%WER 1.89 [ 237 / 12533, 30 ins, 47 del, 160 sub ] exp/cnn4c/decode/wer_3 # per-frame training -# 2D-CNN system (from Harish Mallidi, run by Karel - 22.6.2015) -%WER 2.07 [ 260 / 12533, 32 ins, 60 del, 168 sub ] exp/cnn2d4c/decode/wer_4_0.0 # per-frame training - -# Joint training with WSJ data, FBANK+pitch features. 
2 softmax layers, multitask training,
-# (Karel - 10.7.2015)
-%WER 1.52 [ 191 / 12533, 17 ins, 52 del, 122 sub ] exp/dnn4e-fbank_blocksoftmax/decode/wer_4_0.5
+
+# dnn4b, MFCC,LDA,fMLLR features, (Karel - 30.7.2015)
+# Xent,
+%WER 1.75 [ 219 / 12533, 36 ins, 35 del, 148 sub ] exp/dnn4b_pretrain-dbn_dnn/decode/wer_2_0.0
+%WER 7.90 [ 990 / 12533, 90 ins, 147 del, 753 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_ug/wer_5_1.0
+# sMBR,
+%WER 1.77 [ 222 / 12533, 21 ins, 57 del, 144 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it1/wer_4_0.0
+%WER 1.68 [ 210 / 12533, 24 ins, 43 del, 143 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it3/wer_4_0.0
+%WER 1.58 [ 198 / 12533, 20 ins, 41 del, 137 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_it6/wer_5_0.0
+
+# cnn4c, FBANK+pitch features, (Karel - 30.7.2015)
+# Xent, no-RBM,
+%WER 2.00 [ 251 / 12533, 34 ins, 54 del, 163 sub ] exp/cnn4c/decode/wer_3_0.5
+# Xent, RBM on top of CNN,
+%WER 2.04 [ 256 / 12533, 20 ins, 78 del, 158 sub ] exp/cnn4c_pretrain-dbn_dnn/decode/wer_6_0.5
+# sMBR,
+%WER 2.02 [ 253 / 12533, 35 ins, 54 del, 164 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it1/wer_5_0.0
+%WER 1.93 [ 242 / 12533, 23 ins, 62 del, 157 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it3/wer_6_0.5
+%WER 1.90 [ 238 / 12533, 29 ins, 49 del, 160 sub ] exp/cnn4c_pretrain-dbn_dnn_smbr/decode_it6/wer_6_0.0
+
+# dnn4d, FBANK+pitch, (Karel - 30.7.2015)
+# Xent,
+%WER 1.95 [ 245 / 12533, 22 ins, 63 del, 160 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode/wer_4_1.0
+# sMBR,
+%WER 1.98 [ 248 / 12533, 35 ins, 50 del, 163 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it1/wer_3_0.0
+%WER 1.91 [ 239 / 12533, 19 ins, 60 del, 160 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it3/wer_5_0.5
+%WER 1.88 [ 236 / 12533, 17 ins, 61 del, 158 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_it6/wer_6_0.5
+
+# dnn4e, FBANK+pitch, 2 output layers: rm + wsj, (Karel - 10.7.2015)
+%WER 1.52 [ 191 / 12533, 17 ins, 52 del, 122 sub ] exp/dnn4e-fbank_blocksoftmax/decode/wer_4_0.5 <<<[BEST]
 %WER 7.86 [ 985 / 12533, 84 ins, 160 del, 741 sub ] exp/dnn4e-fbank_blocksoftmax/decode_ug/wer_8_0.0
-# LSTM result
-for x in exp/lstm4f/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
-%WER 2.04 [ 256 / 12533, 18 ins, 60 del, 178 sub ] exp/lstm4f_c512_r200_c512_r200_lr0.0001_mmt0.9_clip50/decode/wer_4_0.5
-# BLSTM result
+# lstm4f, FBANK+pitch, 2LSTMs, (Karel - 30.7.2015)
+%WER 2.15 [ 270 / 12533, 20 ins, 69 del, 181 sub ] exp/lstm4f/decode/wer_5_0.0
+
+# cnn4g-2D, FBANK+pitch, 2D-CNN system (from Harish Mallidi, run by Karel - 22.6.2015)
+%WER 2.07 [ 260 / 12533, 32 ins, 60 del, 168 sub ] exp/cnn2d4c/decode/wer_4_0.0
+
+# dnn4h, FBANK+pitch, ``dummy ivector'', should be same as 'dnn4d', (Karel - 30.7.2015)
+# Xent, no-RBM,
+%WER 2.14 [ 268 / 12533, 29 ins, 71 del, 168 sub ] exp/dnn4h-dummy-ivec/decode/wer_4_0.0
+# Xent, RBM,
+%WER 1.84 [ 230 / 12533, 29 ins, 51 del, 150 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn/decode/wer_3_1.0
+# sMBR,
+%WER 1.83 [ 229 / 12533, 29 ins, 50 del, 150 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it1/wer_3_1.0
+%WER 1.81 [ 227 / 12533, 29 ins, 49 del, 149 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it3/wer_3_1.0
+%WER 1.86 [ 233 / 12533, 34 ins, 46 del, 153 sub ] exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr/decode_it6/wer_3_0.5
+
+# blstm4i, FBANK+pitch, (Karel - ??.6.2015)
 %WER 2.09 [ 262 / 12533, 25 ins, 69 del, 168 sub ] exp/blstm4g/decode/wer_4_0.0
-### nnet1 results, the end ###
+### ^^^ nnet1 results ^^^ ###
+
diff --git
a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,30 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/conf/decode_dnn.config b/egs/rm/s5/conf/decode_dnn.config index e5f85633c5b..e7cfca74763 100644 --- a/egs/rm/s5/conf/decode_dnn.config +++ b/egs/rm/s5/conf/decode_dnn.config @@ -1,13 +1,8 @@ -# RM setup has weird optimal scaling (ACWT is 1/3) -# -# This is much larger than 1/10 on SWBD, we use pseudo LM, -# so LM scores are likely to be overboosted. -# For the discriminative training we will still use acwt 0.1, -# scaling down the LM scores did not bring significant improvement. -# +# In RM, the optimal decode LMWT is in range 2..5, which is different from usual 10..15 +# (it is caused by using simple rule-based LM, instead of n-gram LM), +scoring_opts="--min-lmwt 2 --max-lmwt 10" +# Still, it is better to use --acwt 0.1, both for decoding and sMBR, acwt=0.1 -# Large acwt, beams need to be larger too: +# For this small task we can afford to have large beams, beam=30.0 # beam for decoding. Was 13.0 in the scripts. 
lattice_beam=18.0 # this has most effect on size of the lattices. -# We search for optimal WER in low LMWTs: -scoring_opts="--min-lmwt 2 --max-lmwt 10" # search acoustic scale in larger values diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh new file mode 100644 index 00000000000..0379d16fe13 --- /dev/null +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# this script is a modified version of swbd/run_tdnn_5f.sh + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_5f + +# training options +num_epochs=12 +initial_effective_lrate=0.005 +final_effective_lrate=0.0005 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 6 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir +fi + +if [ $stage -le 7 ]; then + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 200 --jesus-forward-output-dim 500 --jesus-hidden-dim 2000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1000000 \ + --lm-opts "--num-extra-lm-states=200" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet2_online/ivectors \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/train $treedir exp/tri3b_lats $dir || exit 1; +fi + +if [ $stage -le 8 ]; then + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \ + data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1; +fi + +if [ $stage -le 9 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
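+  # A small sanity-check sketch (not part of the original recipe; it assumes the
+  # $dir/tree and $dir/final.mdl produced by the training stages above): the pdf
+  # counts reported by the two standard tools below should agree with each other
+  # and with the graph built here.
+  #   tree-info $dir/tree        # prints num-pdfs, context-width, central-position
+  #   am-info $dir/final.mdl     # prints number of pdfs, transition-ids, etc.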
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 --scoring-opts "--min-lmwt 1" \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph data/test $dir/decode || exit 1; +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_ug $dir $dir/graph_ug + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 20 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet2_online/ivectors_test \ + $dir/graph_ug data/test $dir/decode_ug || exit 1; +fi +wait; +exit 0; diff --git a/egs/rm/s5/local/nnet/run_autoencoder.sh b/egs/rm/s5/local/nnet/run_autoencoder.sh index 2ee4b19bf80..c05792a936b 100755 --- a/egs/rm/s5/local/nnet/run_autoencoder.sh +++ b/egs/rm/s5/local/nnet/run_autoencoder.sh @@ -1,8 +1,16 @@ #!/bin/bash +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to train a simple autoencoder network. +# We use , little different training hyperparameters and MSE objective. + . path.sh . cmd.sh +set -eu + # Train, dir=exp/autoencoder data_fmllr=data-fmllr-tri3b diff --git a/egs/rm/s5/local/nnet/run_blocksoftmax.sh b/egs/rm/s5/local/nnet/run_blocksoftmax.sh index a1de4d433ca..175a6021778 100755 --- a/egs/rm/s5/local/nnet/run_blocksoftmax.sh +++ b/egs/rm/s5/local/nnet/run_blocksoftmax.sh @@ -28,17 +28,14 @@ wsj_ali=../../wsj/s5/exp/tri4b_ali_si284 stage=0 . utils/parse_options.sh || exit 1; -set -u -set -e -set -o pipefail -set -x +set -euxo pipefail # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! -e $dev ] && if [ $stage -le 0 ]; then # Make datadir copies, - utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp 2>/dev/null - utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp 2>/dev/null - utils/copy_data_dir.sh --utt-prefix wsj_ --spk-prefix wsj_ $wsj_original $wsj; rm $wsj/{cmvn,feats}.scp 2>/dev/null + utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp + utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp + utils/copy_data_dir.sh --utt-prefix wsj --spk-prefix wsj $wsj_original $wsj; rm $wsj/{cmvn,feats}.scp # Feature extraction, # Dev set, @@ -46,11 +43,11 @@ if [ $stage -le 0 ]; then $dev $dev/log $dev/data steps/compute_cmvn_stats.sh $dev $dev/log $dev/data # Training set, - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data steps/compute_cmvn_stats.sh $train $train/log $train/data # Wsj, - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $wsj $wsj/log $wsj/data steps/compute_cmvn_stats.sh $wsj $wsj/log $wsj/data diff --git a/egs/rm/s5/local/nnet/run_blstm.sh b/egs/rm/s5/local/nnet/run_blstm.sh index 25dc7dcb455..c9db65f738e 100755 --- a/egs/rm/s5/local/nnet/run_blstm.sh +++ b/egs/rm/s5/local/nnet/run_blstm.sh @@ -12,7 +12,9 @@ # # A more sensible approach should be single-stream training, # and per-utterance updates. But the results were worse. -# + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. . ./cmd.sh . ./path.sh @@ -28,6 +30,8 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh || exit 1; +set -eu + # Make the FBANK features [ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set @@ -37,7 +41,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set @@ -46,7 +50,7 @@ fi if [ $stage -le 1 ]; then # Train the DNN optimizing per-frame cross-entropy. - dir=exp/blstm4g + dir=exp/blstm4i ali=${gmm}_ali # Train @@ -61,8 +65,6 @@ if [ $stage -le 1 ]; then # Decode (reuse HCLG graph) steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi # TODO : sequence training, diff --git a/egs/rm/s5/local/nnet/run_cnn.sh b/egs/rm/s5/local/nnet/run_cnn.sh index c6a5ee209c2..8c5730a1c85 100755 --- a/egs/rm/s5/local/nnet/run_cnn.sh +++ b/egs/rm/s5/local/nnet/run_cnn.sh @@ -1,5 +1,15 @@ #!/bin/bash +# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to build CNN with convolution along frequency axis. +# First we train CNN, then build RBMs on top, then do train per-frame training +# and sequence-discriminative training. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -16,9 +26,10 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh +set -euxo pipefail # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \ @@ -34,6 +45,7 @@ if [ $stage -le 0 ]; then fi # Run the CNN pre-training, +hid_layers=2 if [ $stage -le 1 ]; then dir=exp/cnn4c ali=${gmm}_ali @@ -43,17 +55,23 @@ if [ $stage -le 1 ]; then --cmvn-opts "--norm-means=true --norm-vars=true" \ --delta-opts "--delta-order=2" --splice 5 \ --network-type cnn1d --cnn-proto-opts "--patch-dim1 8 --pitch-dim 3" \ - --hid-layers 2 --learn-rate 0.008 \ + --hid-layers $hid_layers --learn-rate 0.008 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; - # Decode - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + # Decode, + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), if [ $stage -le 2 ]; then + # Concat 'feature_transform' with convolutional layers, + dir=exp/cnn4c + nnet-concat $dir/final.feature_transform \ + "nnet-copy --remove-last-components=$(((hid_layers+1)*2)) $dir/final.nnet - |" \ + $dir/final.feature_transform_cnn +fi + +# Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), +if [ $stage -le 3 ]; then dir=exp/cnn4c_pretrain-dbn transf_cnn=exp/cnn4c/final.feature_transform_cnn # transform with convolutional layers # Train @@ -65,14 +83,14 @@ if [ $stage -le 2 ]; then fi # Re-align using CNN, -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then dir=exp/cnn4c steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $dir ${dir}_ali || exit 1 fi # Train the DNN optimizing cross-entropy, -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then dir=exp/cnn4c_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; ali=exp/cnn4c_ali feature_transform=exp/cnn4c/final.feature_transform @@ -81,7 +99,7 @@ if [ $stage -le 4 ]; then cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - cnn="nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" + cnn="nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" nnet-concat "$cnn" $dbn $cnn_dbn 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train @@ -89,20 +107,20 @@ if [ $stage -le 4 ]; then steps/nnet/train.sh --feature-transform $feature_transform --dbn $cnn_dbn --hid-layers 0 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2, + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. dir=exp/cnn4c_pretrain-dbn_dnn_smbr srcdir=exp/cnn4c_pretrain-dbn_dnn -acwt=0.2 +acwt=0.1 # First we generate lattices and alignments, -if [ $stage -le 4 ]; then +if [ $stage -le 6 ]; then steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $srcdir ${srcdir}_ali || exit 1; steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ @@ -110,11 +128,11 @@ if [ $stage -le 4 ]; then fi # Re-train the DNN by 6 iterations of sMBR, -if [ $stage -le 5 ]; then +if [ $stage -le 7 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 1 3 6; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmm/graph $dev $dir/decode_it${ITER} || exit 1 diff --git a/egs/rm/s5/local/nnet/run_cnn2d.sh b/egs/rm/s5/local/nnet/run_cnn2d.sh index ac69074cf6e..be17bce7a57 100755 --- a/egs/rm/s5/local/nnet/run_cnn2d.sh +++ b/egs/rm/s5/local/nnet/run_cnn2d.sh @@ -1,10 +1,23 @@ #!/bin/bash +# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely) +# Apache 2.0 + +# This example shows how to build CNN with 2D convolution along both frequency +# and time axis. First we train CNN, then build RBMs on top, then do train +# per-frame training and sequence-discriminative training. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh ## Source the tools/utils (import the queue.pl) +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + dev=data-fbank/test train=data-fbank/train @@ -16,9 +29,10 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh +set -eu # Make the FBANK features, -if [ $stage -le 0 ]; then +[ ! 
-e $dev ] && if [ $stage -le 0 ]; then # Dev set utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \ @@ -35,7 +49,7 @@ fi # Run the CNN pre-training, if [ $stage -le 1 ]; then - dir=exp/cnn2d4c + dir=exp/cnn4g-2D ali=${gmm}_ali # Train $cuda_cmd $dir/log/train_nnet.log \ @@ -46,16 +60,14 @@ if [ $stage -le 1 ]; then --hid-layers 2 --learn-rate 0.008 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi # Pre-train stack of RBMs on top of the convolutional layers (4 layers, 1024 units), if [ $stage -le 2 ]; then - dir=exp/cnn2d4c_pretrain-dbn - transf_cnn=exp/cnn2d4c/final.feature_transform_cnn # transform with convolutional layers + dir=exp/cnn4g-2D_pretrain-dbn + transf_cnn=exp/cnn4g-2D/final.feature_transform_cnn # transform with convolutional layers # Train $cuda_cmd $dir/log/pretrain_dbn.log \ steps/nnet/pretrain_dbn.sh --nn-depth 4 --hid-dim 1024 --rbm-iter 20 \ @@ -66,22 +78,22 @@ fi # Re-align using CNN, if [ $stage -le 3 ]; then - dir=exp/cnn2d4c + dir=exp/cnn4g-2D steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ $train data/lang $dir ${dir}_ali || exit 1 fi # Train the DNN optimizing cross-entropy, if [ $stage -le 4 ]; then - dir=exp/cnn2d4c_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; - ali=exp/cnn2d4c_ali - feature_transform=exp/cnn2d4c/final.feature_transform - feature_transform_dbn=exp/cnn2d4c_pretrain-dbn/final.feature_transform - dbn=exp/cnn2d4c_pretrain-dbn/4.dbn + dir=exp/cnn4g-2D_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log; + ali=exp/cnn4g-2D_ali + feature_transform=exp/cnn4g-2D/final.feature_transform + feature_transform_dbn=exp/cnn4g-2D_pretrain-dbn/final.feature_transform + dbn=exp/cnn4g-2D_pretrain-dbn/4.dbn cnn_dbn=$dir/cnn_dbn.nnet { # Concatenate CNN layers and DBN, num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}') - cnn="nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" + cnn="nnet-copy --remove-first-components=$num_components $feature_transform_dbn - |" nnet-concat "$cnn" $dbn $cnn_dbn 2>$dir/log/concat_cnn_dbn.log || exit 1 } # Train @@ -89,17 +101,17 @@ if [ $stage -le 4 ]; then steps/nnet/train.sh --feature-transform $feature_transform --dbn $cnn_dbn --hid-layers 0 \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ $gmm/graph $dev $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $dev $dir/decode_ug || exit 1; fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2, -dir=exp/cnn2d4c_pretrain-dbn_dnn_smbr -srcdir=exp/cnn2d4c_pretrain-dbn_dnn -acwt=0.2 + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. +dir=exp/cnn4g-2D_pretrain-dbn_dnn_smbr +srcdir=exp/cnn4g-2D_pretrain-dbn_dnn +acwt=0.1 # First we generate lattices and alignments, if [ $stage -le 4 ]; then @@ -114,7 +126,7 @@ if [ $stage -le 5 ]; then steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 6 3 1; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmm/graph $dev $dir/decode_it${ITER} || exit 1 diff --git a/egs/rm/s5/local/nnet/run_dnn.sh b/egs/rm/s5/local/nnet/run_dnn.sh index c30d93a7861..c2ba26970ad 100755 --- a/egs/rm/s5/local/nnet/run_dnn.sh +++ b/egs/rm/s5/local/nnet/run_dnn.sh @@ -15,41 +15,45 @@ # the objective is to emphasize state-sequences with better # frame accuracy w.r.t. reference alignment. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh ## Source the tools/utils (import the queue.pl) +set -eu + # Config: gmm=exp/tri3b data_fmllr=data-fmllr-tri3b stage=0 # resume training with --stage=N # End of config. -. utils/parse_options.sh || exit 1; +. utils/parse_options.sh # -if [ $stage -le 0 ]; then +[ ! -e $data_fmllr/test ] && if [ $stage -le 0 ]; then # Store fMLLR features, so we can train on them easily, # test dir=$data_fmllr/test steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir $gmm/decode \ - $dir data/test $gmm $dir/log $dir/data || exit 1 + $dir data/test $gmm $dir/log $dir/data # train dir=$data_fmllr/train steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir ${gmm}_ali \ - $dir data/train $gmm $dir/log $dir/data || exit 1 + $dir data/train $gmm $dir/log $dir/data # split the data : 90% train 10% cross-validation (held-out) - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 fi if [ $stage -le 1 ]; then # Pre-train DBN, i.e. 
a stack of RBMs (small database, smaller DNN) dir=exp/dnn4b_pretrain-dbn - (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log $cuda_cmd $dir/log/pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir || exit 1; + steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir fi if [ $stage -le 2 ]; then @@ -58,42 +62,42 @@ if [ $stage -le 2 ]; then ali=${gmm}_ali feature_transform=exp/dnn4b_pretrain-dbn/final.feature_transform dbn=exp/dnn4b_pretrain-dbn/6.dbn - (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1; + $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir # Decode (reuse HCLG graph) - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph $data_fmllr/test $dir/decode || exit 1; - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.2 \ - $gmm/graph_ug $data_fmllr/test $dir/decode_ug || exit 1; + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/test $dir/decode + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph_ug $data_fmllr/test $dir/decode_ug fi -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. For RM good acwt is 0.2 +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. dir=exp/dnn4b_pretrain-dbn_dnn_smbr srcdir=exp/dnn4b_pretrain-dbn_dnn -acwt=0.2 +acwt=0.1 if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ - $data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1; + $data_fmllr/train data/lang $srcdir ${srcdir}_ali steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1; + $data_fmllr/train data/lang $srcdir ${srcdir}_denlats fi if [ $stage -le 4 ]; then # Re-train the DNN by 6 iterations of sMBR steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ - $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode - for ITER in 1 2 3 4 5 6; do + for ITER in 6 3 1; do steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ - $gmm/graph $data_fmllr/test $dir/decode_it${ITER} || exit 1 + $gmm/graph $data_fmllr/test $dir/decode_it${ITER} done fi diff --git a/egs/rm/s5/local/nnet/run_dnn_fbank.sh b/egs/rm/s5/local/nnet/run_dnn_fbank.sh index 1d736c2603b..4671381d3d3 100755 --- a/egs/rm/s5/local/nnet/run_dnn_fbank.sh +++ b/egs/rm/s5/local/nnet/run_dnn_fbank.sh @@ -15,6 +15,9 @@ # the objective is to emphasize state-sequences with better # frame accuracy w.r.t. reference alignment. +# Note: With DNNs in RM, the optimal LMWT is 2-6. 
Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
+
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
@@ -31,8 +34,10 @@ gmm=exp/tri3b
 stage=0
 . utils/parse_options.sh || exit 1;
+set -eu
+
 # Make the FBANK features
-if [ $stage -le 0 ]; then
+[ ! -e $dev ] && if [ $stage -le 0 ]; then
   # Dev set
   utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
   steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
@@ -40,7 +45,7 @@ if [ $stage -le 0 ]; then
   steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
   # Training set
   utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
-  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \
   $train $train/log $train/data || exit 1;
   steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
   # Split the training set
@@ -70,13 +75,12 @@ if [ $stage -le 2 ]; then
   # Decode (reuse HCLG graph)
   steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
     $gmm/graph $dev $dir/decode || exit 1;
-  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
-    $gmm/graph_ug $dev $dir/decode_ug || exit 1;
 fi
-# Sequence training using sMBR criterion, we do Stochastic-GD
-# with per-utterance updates. We use usually good acwt 0.1
+# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates.
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
 dir=exp/dnn4d-fbank_pretrain-dbn_dnn_smbr
 srcdir=exp/dnn4d-fbank_pretrain-dbn_dnn
 acwt=0.1
@@ -94,7 +98,7 @@ if [ $stage -le 4 ]; then
   steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \
     $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
   # Decode
-  for ITER in 1 2 3 4 5 6; do
+  for ITER in 6 3 1; do
     steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \
     --nnet $dir/${ITER}.nnet --acwt $acwt \
     $gmm/graph $dev $dir/decode_it${ITER} || exit 1
diff --git a/egs/rm/s5/local/nnet/run_dummy_ivec.sh b/egs/rm/s5/local/nnet/run_dummy_ivec.sh
new file mode 100755
index 00000000000..860f209c2a0
--- /dev/null
+++ b/egs/rm/s5/local/nnet/run_dummy_ivec.sh
@@ -0,0 +1,140 @@
+#!/bin/bash
+
+# Copyright 2015 Brno University of Technology (Author: Karel Vesely)
+# Apache 2.0
+
+# This example demonstrates how to add an i-vector to the DNN input (or any other side-info).
+# A fixed vector is pasted to all the frames of an utterance and forwarded to nn-input `as-is',
+# bypassing both the feature transform and global CMVN normalization.
+#
+# The i-vector is simulated by a dummy vector [ 0 0 0 ],
+# note that all the scripts get an extra option '--ivector'
+#
+# First we train the NN without RBM pre-training, then we do the full recipe:
+# RBM pre-training, per-frame training, and sequence-discriminative training.
+
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
+
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+
+. ./path.sh ## Source the tools/utils (import the queue.pl)
+
+dev=data-fbank/test
+train=data-fbank/train
+
+dev_original=data/test
+train_original=data/train
+
+gmm=exp/tri3b
+
+stage=0
+. utils/parse_options.sh
+
+set -uexo pipefail
+
+# Make the FBANK features
+[ ! -e $dev ] && if [ $stage -le 0 ]; then
+  # Dev set
+  utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
+    $dev $dev/log $dev/data
+  steps/compute_cmvn_stats.sh $dev $dev/log $dev/data
+  # Training set
+  utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp
+  steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \
+    $train $train/log $train/data
+  steps/compute_cmvn_stats.sh $train $train/log $train/data
+  # Split the training set
+  utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
+fi
+
+# Create ark with dummy-ivectors,
+[ ! -e data/dummy_ivec.ark ] && cat {$train,$dev}/feats.scp | awk '{ print $1, "[ 0 0 0 ]"; }' >data/dummy_ivec.ark
+ivector=ark:data/dummy_ivec.ark
+
+# 1) Build NN, no pre-training (script test),
+if [ $stage -le 1 ]; then
+  # Train the DNN optimizing per-frame cross-entropy.
+  dir=exp/dnn4h-dummy-ivec
+  ali=${gmm}_ali
+  # Train
+  $cuda_cmd $dir/log/train_nnet.log \
+    steps/nnet/train.sh --hid-layers 4 --hid-dim 1024 --learn-rate 0.008 \
+    --ivector $ivector \
+    --cmvn-opts "--norm-means=true --norm-vars=true" \
+    --delta-opts "--delta-order=2" --splice 5 \
+    ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
+  # Decode (reuse HCLG graph)
+  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
+    --ivector $ivector \
+    $gmm/graph $dev $dir/decode
+fi
+
+# 2) Build NN, with pre-training (script test),
+if [ $stage -le 2 ]; then
+  # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN)
+  dir=exp/dnn4h-dummy-ivec_pretrain-dbn
+  $cuda_cmd $dir/log/pretrain_dbn.log \
+    steps/nnet/pretrain_dbn.sh \
+    --ivector $ivector \
+    --cmvn-opts "--norm-means=true --norm-vars=true" \
+    --delta-opts "--delta-order=2" --splice 5 \
+    --hid-dim 1024 --rbm-iter 20 $train $dir
+fi
+
+if [ $stage -le 3 ]; then
+  # Train the DNN optimizing per-frame cross-entropy.
+  dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn
+  ali=${gmm}_ali
+  feature_transform=exp/dnn4h-dummy-ivec_pretrain-dbn/final.feature_transform
+  dbn=exp/dnn4h-dummy-ivec_pretrain-dbn/6.dbn
+  # Train
+  $cuda_cmd $dir/log/train_nnet.log \
+    steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
+    --ivector $ivector \
+    ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir
+  # Decode (reuse HCLG graph)
+  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
+    --ivector $ivector \
+    $gmm/graph $dev $dir/decode
+fi
+
+
+# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates.
+# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2,
+# the value 0.1 is better both for decoding and sMBR.
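+# A quick way to see where the optimum actually lies (a sketch, reusing the helper
+# already used by the RESULTS file) is to sweep the wer_<lmwt>_<penalty> files of
+# any decode directory from the stages above, e.g.:
+#   grep WER exp/dnn4h-dummy-ivec_pretrain-dbn_dnn/decode/wer_* | utils/best_wer.sh
+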
+dir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4h-dummy-ivec_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 4 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_denlats +fi + +if [ $stage -le 5 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + --ivector $ivector \ + $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + # Decode + for ITER in 1 3 6; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --ivector $ivector \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $dev $dir/decode_it${ITER} || exit 1 + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/rm/s5/local/nnet/run_lstm.sh b/egs/rm/s5/local/nnet/run_lstm.sh index f684ea5b036..191ebbf066e 100755 --- a/egs/rm/s5/local/nnet/run_lstm.sh +++ b/egs/rm/s5/local/nnet/run_lstm.sh @@ -6,6 +6,9 @@ # This example script trains a LSTM network on FBANK features. # The LSTM code comes from Yiayu DU, and Wei Li, thanks! +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + . ./cmd.sh . ./path.sh @@ -20,6 +23,8 @@ gmm=exp/tri3b stage=0 . utils/parse_options.sh || exit 1; +set -eu + # Make the FBANK features [ ! -e $dev ] && if [ $stage -le 0 ]; then # Dev set @@ -29,7 +34,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set @@ -45,8 +50,9 @@ if [ $stage -le 1 ]; then $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --network-type lstm --learn-rate 0.0001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ - --train-opts "--momentum 0.9 --halving-factor 0.5" \ - --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ + --scheduler-opts "--momentum 0.9 --halving-factor 0.5" \ + --train-tool "nnet-train-lstm-streams" \ + --train-tool-opts "--num-stream=4 --targets-delay=5" \ --proto-opts "--num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 5.0" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/rm/s5/local/nnet/run_multilingual.sh b/egs/rm/s5/local/nnet/run_multilingual.sh index cecfe09be90..126f616c34d 100755 --- a/egs/rm/s5/local/nnet/run_multilingual.sh +++ b/egs/rm/s5/local/nnet/run_multilingual.sh @@ -58,7 +58,7 @@ if [ $stage -le 0 ]; then tgt_dir=$data/${code}_$(basename $dir) utils/copy_data_dir.sh --utt-suffix _$code --spk-suffix _$code $dir $tgt_dir; rm $tgt_dir/{feats,cmvn}.scp || true # remove features, # extract features, get cmvn stats, - steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd -tc 10" $tgt_dir{,/log,/data} + steps/make_fbank_pitch.sh --nj 30 --cmd "$train_cmd 
--max-jobs-run 10" $tgt_dir{,/log,/data} steps/compute_cmvn_stats.sh $tgt_dir{,/log,/data} # split lists 90% train / 10% held-out, utils/subset_data_dir_tr_cv.sh $tgt_dir ${tgt_dir}_tr90 ${tgt_dir}_cv10 @@ -89,7 +89,7 @@ objective_function="multitask$(echo ${ali_dim[@]} | tr ' ' '\n' | \ echo "Multitask objective function: $objective_function" # DNN training will be in $dir, the alignments are prepared beforehand, -dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-') +dir=exp/dnn4g-multilingual${num_langs}-$(echo $lang_code_csl | tr ',' '-')-${nnet_type} [ ! -e $dir ] && mkdir -p $dir echo "$lang_code_csl" >$dir/lang_code_csl echo "$ali_dir_csl" >$dir/ali_dir_csl @@ -119,9 +119,10 @@ fi if [ $stage -le 2 ]; then case $nnet_type in bn) + # Bottleneck network (40 dimensional bottleneck is good for fMLLR), $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ - --hid-layers 2 --hid-dim 1500 --bn-dim 80 \ + --hid-layers 2 --hid-dim 1500 --bn-dim 40 \ --cmvn-opts "--norm-means=true --norm-vars=false" \ --feat-type "traps" --splice 5 --traps-dct-basis 6 \ --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ @@ -129,7 +130,38 @@ if [ $stage -le 2 ]; then --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir ;; + sbn) + # Stacked Bottleneck Netowork, no fMLLR in between, + bn1_dim=80 + bn2_dim=30 + # Train 1st part, + dir_part1=${dir}_part1 + $cuda_cmd ${dir}_part1/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate 0.008 \ + --hid-layers 2 --hid-dim 1500 --bn-dim $bn1_dim \ + --cmvn-opts "--norm-means=true --norm-vars=false" \ + --feat-type "traps" --splice 5 --traps-dct-basis 6 \ + --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ + --proto-opts "--block-softmax-dims=${ali_dim_csl}" \ + --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ + ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir_part1 + # Compose feature_transform for 2nd part, + nnet-initialize <(echo " $bn1_dim $((13*bn1_dim)) -10 -5:5 10 ") \ + $dir_part1/splice_for_bottleneck.nnet + nnet-concat $dir_part1/final.feature_transform "nnet-copy --remove-last-components=4 $dir_part1/final.nnet - |" \ + $dir_part1/splice_for_bottleneck.nnet $dir_part1/final.feature_transform.part1 + # Train 2nd part, + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --learn-rate 0.008 \ + --feature-transform $dir_part1/final.feature_transform.part1 \ + --hid-layers 2 --hid-dim 1500 --bn-dim $bn2_dim \ + --labels "scp:$dir/ali-post/combined.scp" --num-tgt $output_dim \ + --proto-opts "--block-softmax-dims=${ali_dim_csl}" \ + --train-tool "nnet-train-frmshuff --objective-function=$objective_function" \ + ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir + ;; dnn_small) + # 4 hidden layers, 1024 sigmoid neurons, $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ --cmvn-opts "--norm-means=true --norm-vars=true" \ @@ -140,6 +172,7 @@ if [ $stage -le 2 ]; then ${data_tr90} ${data_cv10} lang-dummy ali-dummy ali-dummy $dir ;; dnn) + # 6 hidden layers, 2048 simgoid neurons, $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --learn-rate 0.008 \ --hid-layers 6 --hid-dim 2048 \ diff --git a/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh b/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh index 132d2c8f93f..2bddefdac04 100755 --- a/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh +++ 
b/egs/rm/s5/local/online/run_nnet2_multisplice_disc.sh @@ -65,9 +65,9 @@ fi if [ $stage -le 3 ]; then # I tested the following with --max-temp-archives 3 # to test other branches of the code. - # the -tc 5 limits the I/O. + # the --max-jobs-run 5 limits the I/O. steps/online/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc 5" \ + --cmd "$decode_cmd --max-jobs-run 5" \ --criterion $criterion --drop-frames $drop_frames \ data/train data/lang ${srcdir}{_ali,_denlats,,_degs} || exit 1; fi diff --git a/egs/rm/s5/local/online/run_nnet2_perturbed.sh b/egs/rm/s5/local/online/run_nnet2_perturbed.sh index eacb071be6e..c018ca2880b 100755 --- a/egs/rm/s5/local/online/run_nnet2_perturbed.sh +++ b/egs/rm/s5/local/online/run_nnet2_perturbed.sh @@ -95,7 +95,7 @@ if [ $stage -le 6 ]; then # dir is the neural-net training dir. utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/rm/s5/$dir/egs $dir/egs/storage fi - # the -tc 15 allows more of the dump_egs jobs than the default (5), since we + # the --max-jobs-run 15 allows more of the dump_egs jobs than the default (5), since we # have 4 filesystems to access. We reduce the number of epochs since we have # more data and we don't want so slow down the training too much, and we also # reduce the final learning rate (when we have a lot of data we like a ratio of 10 @@ -110,7 +110,7 @@ if [ $stage -le 6 ]; then --num-threads "$num_threads" \ --minibatch-size "$minibatch_size" \ --parallel-opts "$parallel_opts" \ - --io-opts "-tc 15" \ + --io-opts "--max-jobs-run 15" \ --num-jobs-nnet 4 \ --num-epochs 5 --num-epochs-extra 2 \ --add-layers-period 2 \ diff --git a/egs/rm/s5/local/test_decoders.sh b/egs/rm/s5/local/test_decoders.sh index 53e9d1f884c..2b1d4172139 100755 --- a/egs/rm/s5/local/test_decoders.sh +++ b/egs/rm/s5/local/test_decoders.sh @@ -4,12 +4,12 @@ dir=exp/tri1/decode/tmp mkdir -p $dir acwt=0.083333 -beam=15.0 +beam=15.0 n=100 # number of utts to decode . 
./path.sh -gmm-latgen-faster --max-arcs=-1 --beam=$beam --lattice-beam=6.0 --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" "ark:|lattice-1best --acoustic-scale=$acwt ark:- ark:- | gzip -c > $dir/lat.1.gz" 2>$dir/decode_latgen_faster.log & +gmm-latgen-faster --beam=$beam --lattice-beam=6.0 --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" "ark:|lattice-1best --acoustic-scale=$acwt ark:- ark:- | gzip -c > $dir/lat.1.gz" 2>$dir/decode_latgen_faster.log & gmm-decode-faster --beam=$beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=exp/tri1/graph/words.txt exp/tri1/final.mdl exp/tri1/graph/HCLG.fst "ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/test/utt2spk scp:data/test/cmvn.scp 'scp:head -n $n data/test/feats.scp|' ark:- | add-deltas ark:- ark:- |" ark:/dev/null ark:/dev/null "ark:|gzip -c > $dir/lat.2.gz" 2>$dir/decode_faster.log & @@ -26,4 +26,3 @@ wait echo "$0: decoder comparison test succeeded" exit 0; - diff --git a/egs/rm/s5/path.sh b/egs/rm/s5/path.sh index c3be1ca9d0e..1a6fb5f891b 100755 --- a/egs/rm/s5/path.sh +++ b/egs/rm/s5/path.sh @@ -1,3 +1,5 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/rm/s5/run.sh b/egs/rm/s5/run.sh index 362e215ecfa..43ec446e6fe 100755 --- a/egs/rm/s5/run.sh +++ b/egs/rm/s5/run.sh @@ -1,6 +1,6 @@ #!/bin/bash -. cmd.sh +. 
./cmd.sh set -e # exit on error @@ -26,8 +26,8 @@ local/rm_prepare_grammar_ug.sh # Unigram grammar (gives worse results, but featdir=mfcc for x in test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92 train; do - steps/make_mfcc.sh --nj 8 --cmd "run.pl" data/$x exp/make_feat/$x $featdir - #steps/make_plp.sh --nj 8 --cmd "run.pl" data/$x exp/make_feat/$x $featdir + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" data/$x exp/make_feat/$x $featdir + #steps/make_plp.sh --nj 8 --cmd "$train_cmd" data/$x exp/make_feat/$x $featdir steps/compute_cmvn_stats.sh data/$x exp/make_feat/$x $featdir done @@ -38,7 +38,7 @@ done utils/combine_data.sh data/test data/test_{mar87,oct87,feb89,oct89,feb91,sep92} steps/compute_cmvn_stats.sh data/test exp/make_feat/test $featdir -utils/subset_data_dir.sh data/train 1000 data/train.1k +utils/subset_data_dir.sh data/train 1000 data/train.1k steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono @@ -50,8 +50,6 @@ steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph - - steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ exp/mono/graph data/test exp/mono/decode @@ -78,14 +76,15 @@ local/test_decoders.sh # This is a test program that we run only in the steps/align_si.sh --nj 8 --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri1 exp/tri1_ali -# train tri2a [delta+delta-deltas] -steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 \ - data/train data/lang exp/tri1_ali exp/tri2a +# the tri2a experiments are not needed downstream, so commenting them out. +# # train tri2a [delta+delta-deltas] +# steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 \ +# data/train data/lang exp/tri1_ali exp/tri2a -# decode tri2a -utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph -steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode +# # decode tri2a +# utils/mkgraph.sh data/lang exp/tri2a exp/tri2a/graph +# steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ +# exp/tri2a/graph data/test exp/tri2a/decode # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" \ @@ -151,9 +150,9 @@ steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \ # # has bad transcripts, so you can filter it out. Below we demonstrate how to # # run this script. # steps/cleanup/find_bad_utts.sh --nj 20 --cmd "$train_cmd" data/train data/lang \ -# exp/tri3b_ali exp/tri3b_cleanup +# exp/tri3b_ali exp/tri3b_cleanup # # The following command will show you some of the hardest-to-align utterances in the data. -# head exp/tri3b_cleanup/all_info.sorted.txt +# head exp/tri3b_cleanup/all_info.sorted.txt ## MMI on top of tri3b (i.e. LDA+MLLT+SAT+MMI) steps/make_denlats.sh --config conf/decode.config \ @@ -173,7 +172,7 @@ steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ # local/online/run_gmm.sh # local/online/run_nnet2.sh # local/online/run_baseline.sh -# Note: for online decoding with pitch, look at local/run_pitch.sh, +# Note: for online decoding with pitch, look at local/run_pitch.sh, # which calls local/online/run_gmm_pitch.sh # @@ -243,11 +242,14 @@ local/run_sgmm2.sh # # local/run_nnet2.sh -# Karel's neural net recipe. -# local/nnet/run_dnn.sh +# Karel's neural net recipe. +# local/nnet/run_dnn.sh # Karel's CNN recipe. # local/nnet/run_cnn.sh # Karel's 2D-CNN recipe (from Harish). 
# local/nnet/run_cnn2d.sh + +# chain recipe +# local/chain/run_tdnn_5f.sh diff --git a/egs/sprakbanken/README.txt b/egs/sprakbanken/README.txt index 1cf88788ce8..962b7cb7dbe 100644 --- a/egs/sprakbanken/README.txt +++ b/egs/sprakbanken/README.txt @@ -1,10 +1,10 @@ About the sprakbanken corpus: This corpus is a free corpus originally collected by NST for ASR purposes and currently hosted by the Norwegian libraries. The corpus is multilingual and contains Swedish, - Norwegian (Bokmål) and Danish. The current setup works for Danish. The vocabulary is + Norwegian (Bokmål) and Danish. The current setup uses the Danish subcorpus. The vocabulary is large and there is approx. 350 hours of read-aloud speech with associated text scripts. + Some months ago the corpus was republished here: http://www.nb.no/sprakbanken/#ticketsfrom?lang=en - - s1: This is the current recommended recipe. (Danish) + s5: This is the current recommended recipe. (Danish) diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh b/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh index ce3edc7a9a3..f52a0028074 100755 --- a/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh +++ b/egs/sprakbanken/s5/local/nnet2/run_5d_gpu.sh @@ -13,7 +13,7 @@ dir=exp/nnet5d_gpu . ./cmd.sh . utils/parse_options.sh -( +( if [ ! -z "$temp_dir" ] && [ ! 
-e $dir/egs ]; then mkdir -p $dir @@ -32,19 +32,19 @@ dir=exp/nnet5d_gpu --p 2 \ data/train_si284 data/lang exp/tri4b_ali_si284 $dir || exit 1 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \ --transform-dir exp/tri4b/decode_tgpr_dev93 \ exp/tri4b/graph_tgpr data/test_dev93 $dir/decode_tgpr_dev93 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \ --transform-dir exp/tri4b/decode_tgpr_eval92 \ exp/tri4b/graph_tgpr data/test_eval92 $dir/decode_tgpr_eval92 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 10 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \ --transform-dir exp/tri4b/decode_bd_tgpr_dev93 \ exp/tri4b/graph_bd_tgpr data/test_dev93 $dir/decode_bd_tgpr_dev93 - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 8 \ --transform-dir exp/tri4b/decode_bd_tgpr_eval92 \ exp/tri4b/graph_bd_tgpr data/test_eval92 $dir/decode_bd_tgpr_eval92 ) diff --git a/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh b/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh index 8b1fed26422..4ce59dbf86d 100755 --- a/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh +++ b/egs/sprakbanken/s5/local/nnet2/sprak_run_5c.sh @@ -1,7 +1,7 @@ #!/bin/bash # This is neural net training on top of adapted 40-dimensional features. -# +# . ./cmd.sh @@ -16,13 +16,13 @@ test2=$3 --num-hidden-layers 4 --hidden-layer-dim 1024 \ --cmd "$decode_cmd" \ data/train data/lang exp/tri4b_ali exp/nnet5c || exit 1 - - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 7 \ + + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 7 \ --transform-dir exp/tri4b/decode_${uid}_$test1 \ exp/tri4b/graph_$uid data/$test1 exp/nnet5c/decode_${uid}_$test1 if [ -d $test2 ]; then - steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 4 \ + steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 4 \ --transform-dir exp/tri4b/decode_${uid}_$test2 \ exp/tri4b/graph_${uid} data/$test2 exp/nnet5c/decode_${uid}_$test2 fi diff --git a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh index 5b74fcdfee5..55d6d60bf9d 100755 --- a/egs/sprakbanken/s5/local/sprak_train_cmulm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_cmulm.sh @@ -52,25 +52,10 @@ mkdir -p $test cp -r data/lang/* $test cat $lmdir/sprak.arpa | \ -utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
-cat $lmdir/sprak.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; exit 0; - diff --git a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh index 7abef919e0c..33b27cc3e4c 100755 --- a/egs/sprakbanken/s5/local/sprak_train_irstlm.sh +++ b/egs/sprakbanken/s5/local/sprak_train_irstlm.sh @@ -61,8 +61,8 @@ fi # Checks if espeak is available on the system. espeak is necessary to extend -# the setup because the original transcriptions were created with espeak and -# filtered +# the setup because the original transcriptions were created with espeak and +# filtered if ! which espeak >&/dev/null; then echo "espeak is not available on your system. You must install espeak before proceeding." @@ -95,7 +95,7 @@ if [ ! -f $extdict/lexicon.txt ]; # Filter transcription - # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove + # Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove # initial and trailing spaces and collapse 2 or more spaces to one space cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt @@ -128,7 +128,7 @@ if [ ! -f $lmdir/extra4.ngt ]; grep -P -v '^[\s?|\.|\!]*$' $newtext | \ awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt - + # Envelop LM training data in context cues add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input @@ -151,22 +151,8 @@ mkdir -p $test cp -r $extlang $test cat $lmdir/extra${N}$lm_suffix | \ -utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
-cat $lmdir/extra${N}$lm_suffix | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; diff --git a/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh b/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh index 4d5c0cbb462..16233da5d65 100755 --- a/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh +++ b/egs/sprakbanken/s5/local/sprak_train_rnnlms.sh @@ -28,32 +28,10 @@ devtext=$2 dir=$3 mkdir -p $dir +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -d $rnnlm_ver ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! -f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi -) || exit 1; - - if [ ! -f $srcdir/transcripts.uniq ] || [ ! -f $srcdir/lexicon.txt ]; then echo "Expecting $srcdir/transcripts.uniq and $srcdir/lexicon.txt to exist"; exit 1; diff --git a/egs/sprakbanken/s5/local/train_irstlm.sh b/egs/sprakbanken/s5/local/train_irstlm.sh index f0b649dd1c7..c91b68f8aab 100755 --- a/egs/sprakbanken/s5/local/train_irstlm.sh +++ b/egs/sprakbanken/s5/local/train_irstlm.sh @@ -66,22 +66,8 @@ mkdir -p $test cp -r $srcdir/* $test cat $lmdir/train${ngram}.arpa | \ - utils/find_arpa_oovs.pl $test/words.txt > $lmdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. -cat $lmdir/train${ngram}.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $lmdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl $test || exit 1; diff --git a/egs/sprakbanken/s5/path.sh b/egs/sprakbanken/s5/path.sh index 9df7df54e99..2d17b17a84a 100755 --- a/egs/sprakbanken/s5/path.sh +++ b/egs/sprakbanken/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre08/v1/path.sh b/egs/sre08/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre08/v1/path.sh +++ b/egs/sre08/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre08/v1/sid/compute_vad_decision_gmm.sh b/egs/sre08/v1/sid/compute_vad_decision_gmm.sh new file mode 100755 index 00000000000..b1fee318f34 --- /dev/null +++ b/egs/sre08/v1/sid/compute_vad_decision_gmm.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# Copyright 2015 David Snyder +# Apache 2.0 +# +# Compute GMM-based VAD output and optionally combine with +# the energy-based VAD decisions. + +nj=10 +cmd=run.pl +map_config= +merge_map_config= +priors= +use_energy_vad=false +num_gselect=20 +norm_vars=false +center=true +stage=-4 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 5 ]; then + echo "Usage: $0 [options] ... "; + echo "e.g.: $0 data/train exp/music_gmm exp/speech_gmm exp/noise_gmm exp/gmm_vad exp/gmm_vad" + echo " Options:" + echo " --map-config # config passed to compute-vad-from-frame-likes" + echo " --priors # list passed to compute-vad-from-frame-likes" + echo " --merge-map-config # config passed to merge-vads" + echo " --use-energy-vad # If true, look for a vad.scp file and combine it with this VAD" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +args=("$@") +gmm_dirs=(${@:2:$(($#-3))}) # The GMM directories +num_gmms=`expr $# - 3` + +data=${args[0]} +log_dir=${args[$num_gmms+1]} +vad_dir=${args[$num_gmms+2]} + +# make $vad_dir an absolute pathname. +vad_dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' ${vad_dir} ${PWD}` +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $vad_dir || exit 1; +mkdir -p $log_dir || exit 1; + +if $use_energy_vad; then + for f in $data/vad.scp "$merge_map_config"; do + if [ ! -f $f ]; then + echo "compute_vad_decision_gmm.sh: no such file $f" + exit 1; + fi + done +fi + +if [ ! -f $data/feats.scp ]; then + echo "compute_vad_decision_gmm.sh: no such file $f" + exit 1; +fi + +utils/split_data.sh $data $nj || exit 1; +sdata=$data/split$nj; + +# We assume that the same delta-opts is used for each +# GMM dir. +delta_opts=`cat ${gmm_dirs[0]}/delta_opts 2>/dev/null` +if [ -f ${gmm_dirs[0]}/delta_opts ]; then + cp ${gmm_dirs[0]}/delta_opts $dir/ 2>/dev/null +fi + +## Set up features. 
+feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=300 ark:- ark:- |" + +if [ $stage -le -2 ]; then + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + $cmd ${log_dir}/log/${gmm_name}_convert.log \ + fgmm-global-to-gmm ${gmm_dir}/final.ubm ${vad_dir}/${gmm_name}_final.dubm || exit 1; + done +fi + +if [ $stage -le -1 ]; then + echo "$0: doing Gaussian selection" + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + $cmd JOB=1:$nj ${log_dir}/log/${gmm_name}_gselect.JOB.log \ + gmm-gselect --n=$num_gselect ${vad_dir}/${gmm_name}_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=${num_gselect} ${gmm_dir}/final.ubm \ + "$feats" "ark:|gzip -c >${vad_dir}/${gmm_name}_gselect.JOB.gz" || exit 1; + done +fi + +frame_likes="" +if [ $stage -le 0 ]; then + echo "$0: computing frame likelihoods" + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + frame_likes="${frame_likes} ark:${vad_dir}/${gmm_name}_logprob.JOB.ark" + $cmd JOB=1:$nj ${log_dir}/log/get_${gmm_name}_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=false \ + "--gselect=ark,s,cs:gunzip -c ${vad_dir}/${gmm_name}_gselect.JOB.gz|" ${gmm_dir}/final.ubm \ + "$feats" ark:${vad_dir}/${gmm_name}_logprob.JOB.ark || exit 1; + done + + echo "$0: computing VAD decisions from frame likelihoods" + $cmd JOB=1:$nj ${log_dir}/log/make_vad_gmm_${name}.JOB.log \ + compute-vad-from-frame-likes --map=${map_config} --priors=$priors $frame_likes \ + ark,scp:${vad_dir}/vad_gmm_${name}.JOB.ark,${vad_dir}/vad_gmm_${name}.JOB.scp \ + || exit 1; + + if $use_energy_vad ; then + echo "$0: merging with energy-based VAD decisions" + $cmd JOB=1:$nj ${log_dir}/log/merge_vads_${name}.JOB.log \ + merge-vads --map=${merge_map_config} scp:$sdata/JOB/vad.scp \ + scp:${vad_dir}/vad_gmm_${name}.JOB.scp \ + ark,scp:${vad_dir}/vad_merged_${name}.JOB.ark,${vad_dir}/vad_merged_${name}.JOB.scp \ + || exit 1; + fi + + echo "$0: moving old vad.scp to ${data}/vad.scp.bak" + mv ${data}/vad.scp ${data}/vad.scp.bak + + for ((n=1; n<=nj; n++)); do + if $use_energy_vad ; then + cat ${vad_dir}/vad_merged_${name}.$n.scp || exit 1; + else + cat ${vad_dir}/vad_gmm_${name}.$n.scp || exit 1; + fi + done > ${data}/vad.scp +fi + +nc=`cat $data/vad.scp | wc -l` +nu=`cat $data/feats.scp | wc -l` +if [ $nc -ne $nu ]; then + echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" + echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" + [ $nc -eq 0 ] && exit 1; +fi + +echo "$0 created GMM-based VAD output for $name" + +if $cleanup ; then + for gmm_dir in "${gmm_dirs[@]}"; + do + gmm_name=`basename $gmm_dir` + rm ${vad_dir}/${gmm_name}_gselect.*.gz + rm ${vad_dir}/${gmm_name}_logprob.*.ark + done +fi + +exit 0; diff --git a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh index 22c5de9b9c3..f6710028ae5 100755 --- a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh +++ b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh @@ -13,7 +13,6 @@ cmd="run.pl" stage=-2 delta_window=3 delta_order=2 -num_components=5297 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -31,7 +30,6 @@ if [ $# != 4 ]; then echo " --nj # number of parallel training jobs" echo " --delta-window # delta window size" echo " --delta-order # delta order" - echo " --number-components # number of components in the final GMM needs" echo " # to be equal to the size of the DNN output layer." exit 1; fi @@ -41,7 +39,9 @@ data_dnn=$2 nnet=$3 dir=$4 -for f in $data/feats.scp $data/vad.scp; do + +for f in $data/feats.scp $data/vad.scp ${data_dnn}/feats.scp \ + ${data_dnn}/vad.scp $nnet; do [ ! -f $f ] && echo "No such file $f" && exit 1; done @@ -64,6 +64,11 @@ feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | \ apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | \ select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +# Parse the output of nnet-am-info to find the size of the output layer +# of the TDNN. This will also correspond to the number of components +# in the ancillary GMM. +num_components=`grep -oP 'output-dim\ \K[0-9]+' <(nnet-am-info $nnet 2> /dev/null)` + $cmd JOB=1:$nj $logdir/make_stats.JOB.log \ nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ diff --git a/egs/sre08/v1/sid/music_id.sh b/egs/sre08/v1/sid/music_id.sh new file mode 100755 index 00000000000..4233b5752fd --- /dev/null +++ b/egs/sre08/v1/sid/music_id.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +# Copyright 2015 David Snyder +# Apache 2.0. + +# This script calculates the relative probability of music versus +# speech. + +# Begin configuration section. +nj=10 +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal and full covariance models +norm_vars=false +center=true +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/full_ubm_music exp/full_ubm_speech data/test exp/test_results" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --cleanup # If true, clean up temporary files" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + exit 1; +fi + +music_ubmdir=$1 +speech_ubmdir=$2 +data=$3 +dir=$4 + +delta_opts=`cat $speech_ubmdir/delta_opts 2>/dev/null` + +for f in $music_ubmdir/final.ubm $speech_ubmdir/final.ubm $data/feats.scp $data/vad.scp; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log || exit 1; +sdata=$data/split$nj +utils/split_data.sh $data $nj || exit 1; + +## Set up features. 
+feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=$norm_vars --center=$center --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" + +if [ $stage -le -2 ]; then + $cmd $dir/log/music_convert.log \ + fgmm-global-to-gmm $music_ubmdir/final.ubm $dir/music_final.dubm || exit 1; +fi +if [ $stage -le -2 ]; then + $cmd $dir/log/speech_convert.log \ + fgmm-global-to-gmm $speech_ubmdir/final.ubm $dir/speech_final.dubm || exit 1; +fi + +# Do Gaussian selection using the diagonal forms of the models. + +if [ $stage -le -1 ]; then + echo $nj > $dir/num_jobs + echo "$0: doing Gaussian selection for music UBM" + $cmd JOB=1:$nj $dir/log/music_gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/music_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=$num_gselect $music_ubmdir/final.ubm \ + "$feats" "ark:|gzip -c >$dir/music_gselect.JOB.gz" || exit 1; + + echo $nj > $dir/num_jobs + echo "$0: doing Gaussian selection for speech UBM" + $cmd JOB=1:$nj $dir/log/speech_gselect.JOB.log \ + gmm-gselect --n=$num_gselect $dir/speech_final.dubm "$feats" ark:- \| \ + fgmm-gselect --gselect=ark,s,cs:- --n=$num_gselect $speech_ubmdir/final.ubm \ + "$feats" "ark:|gzip -c >$dir/speech_gselect.JOB.gz" || exit 1; +fi + +if ! [ $nj -eq $(cat $dir/num_jobs) ]; then + echo "Number of jobs mismatch" + exit 1; +fi + +# Calculate the average frame-level log-likelihoods for the utterances under +# the music and speech UBMs. +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_music_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=true \ + "--gselect=ark,s,cs:gunzip -c $dir/music_gselect.JOB.gz|" $music_ubmdir/final.ubm \ + "$feats" ark,t:$dir/music_logprob.JOB || exit 1; +fi +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/get_speech_logprob.JOB.log \ + fgmm-global-get-frame-likes --average=true \ + "--gselect=ark,s,cs:gunzip -c $dir/speech_gselect.JOB.gz|" $speech_ubmdir/final.ubm \ + "$feats" ark,t:$dir/speech_logprob.JOB || exit 1; +fi + +if [ $stage -le 2 ]; then + + for j in $(seq $nj); do cat $dir/music_logprob.$j; done > $dir/music_logprob + for j in $(seq $nj); do cat $dir/speech_logprob.$j; done > $dir/speech_logprob + + n1=$(cat $dir/music_logprob | wc -l) + n2=$(cat $dir/speech_logprob | wc -l) + + if [ $n1 -ne $n2 ]; then + echo "Number of lines mismatch, music versus speech UBM probs: $n1 vs $n2" + exit 1; + fi + + paste $dir/music_logprob $dir/speech_logprob | \ + awk '{if ($1 != $3) { print >/dev/stderr "Sorting mismatch"; exit(1); } print $1, $2, $4;}' \ + >$dir/logprob || exit 1; + + cat $dir/logprob | \ + awk '{lratio = $2-$3; print $1, 1/(1+exp(-lratio));}' \ + >$dir/ratio || exit 1; +fi + +if $cleanup; then + rm $dir/speech_gselect.*.gz + rm $dir/music_gselect.*.gz +fi + +exit 0; diff --git a/egs/sre10/README.txt b/egs/sre10/README.txt index 5f9c0337550..8390136d52b 100644 --- a/egs/sre10/README.txt +++ b/egs/sre10/README.txt @@ -10,8 +10,9 @@ are required by the subdirectories. See the corresponding README.txt files in the subdirectories for more details. - The subdirectories "v1" and so on are different versions of the recipe; - we don't call them "s1" etc., because they don't really correspond to - the speech recognition recipes. - + The subdirectories "v1" and so on are different iVector-based speaker + recognition recipes. The recipe in v1 demonstrates a standard approach + using a full-covariance GMM-UBM, iVectors, and a PLDA backend. 
The example + in v2 replaces the GMM of the v1 recipe with a time-delay deep neural + network. diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh b/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh index aaa45f8e4e1..1d7c4013b83 100755 --- a/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh +++ b/egs/sre10/v1/local/dnn/fisher_create_test_lang.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -11,26 +11,13 @@ arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz mkdir -p data/lang_test cp -r data/lang/* data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -59,4 +46,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo "$0 succeeded" - diff --git a/egs/sre10/v1/local/dnn/train_dnn.sh b/egs/sre10/v1/local/dnn/train_dnn.sh index e1ce8ae8e79..d9330e58b69 100755 --- a/egs/sre10/v1/local/dnn/train_dnn.sh +++ b/egs/sre10/v1/local/dnn/train_dnn.sh @@ -15,7 +15,7 @@ set -e local/dnn/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13 # You could also try specifying the --calldata argument to this command as below. -# If specified, the script will use actual speaker personal identification +# If specified, the script will use actual speaker personal identification # numbers released with the dataset, i.e. real speaker IDs. Note: --calldata has # to be the first argument of this script. # local/fisher_data_prep.sh --calldata /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ @@ -28,7 +28,7 @@ local/dnn/fisher_prepare_dict.sh utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -local/dnn/fisher_train_lms.sh +local/dnn/fisher_train_lms.sh local/dnn/fisher_create_test_lang.sh # Use the first 4k sentences as dev set. Note: when we trained the LM, we used @@ -55,12 +55,12 @@ utils/subset_data_dir.sh --first data/dev_and_test_asr 5000 data/dev_asr utils/subset_data_dir.sh --last data/dev_and_test_asr 5000 data/test_asr rm -r data/dev_and_test_asr -steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir -steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir +steps/compute_cmvn_stats.sh data/dev_asr exp/make_mfcc/dev_asr $mfccdir +steps/compute_cmvn_stats.sh data/test_asr exp/make_mfcc/test_asr $mfccdir n=$[`cat data/train_all_asr/segments | wc -l` - 10000] utils/subset_data_dir.sh --last data/train_all_asr $n data/train_asr -steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir +steps/compute_cmvn_stats.sh data/train_asr exp/make_mfcc/train_asr $mfccdir # Now-- there are 1.6 million utterances, and we want to start the monophone training @@ -75,30 +75,30 @@ utils/subset_data_dir.sh --speakers data/train_asr 30000 data/train_asr_30k utils/subset_data_dir.sh --speakers data/train_asr 100000 data/train_asr_100k -# The next commands are not necessary for the scripts to run, but increase -# efficiency of data access by putting the mfcc's of the subset +# The next commands are not necessary for the scripts to run, but increase +# efficiency of data access by putting the mfcc's of the subset # in a contiguous place in a file. -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. 
- cp data/train_asr_10k_nodup/feats.scp{,.bak} + cp data/train_asr_10k_nodup/feats.scp{,.bak} copy-feats scp:data/train_asr_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \ && cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_asr_10k_nodup/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_asr_30k/feats.scp{,.bak} + cp data/train_asr_30k/feats.scp{,.bak} copy-feats scp:data/train_asr_30k/feats.scp ark,scp:$mfccdir/kaldi_fish_30k.ark,$mfccdir/kaldi_fish_30k.scp \ && cp $mfccdir/kaldi_fish_30k.scp data/train_asr_30k/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_asr_100k/feats.scp{,.bak} + cp data/train_asr_100k/feats.scp{,.bak} copy-feats scp:data/train_asr_100k/feats.scp ark,scp:$mfccdir/kaldi_fish_100k.ark,$mfccdir/kaldi_fish_100k.scp \ && cp $mfccdir/kaldi_fish_100k.scp data/train_asr_100k/feats.scp ) steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_asr_10k_nodup data/lang exp/mono0a + data/train_asr_10k_nodup data/lang exp/mono0a steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train_asr_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; @@ -109,7 +109,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + exp/tri1/graph data/dev_asr exp/tri1/decode_dev)& steps/align_si.sh --nj 30 --cmd "$train_cmd" \ data/train_asr_30k data/lang exp/tri1 exp/tri1_ali || exit 1; @@ -120,7 +120,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + exp/tri2/graph data/dev_asr exp/tri2/decode_dev || exit 1; )& @@ -134,11 +134,11 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + exp/tri3a/graph data/dev_asr exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ @@ -150,10 +150,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + exp/tri4a/graph data/dev_asr exp/tri4a/decode_dev )& - steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ data/train_asr data/lang exp/tri4a exp/tri4a_ali || exit 1; @@ -164,7 +163,7 @@ steps/train_sat.sh --cmd "$train_cmd" \ ( utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev + exp/tri5a/graph data/dev_asr exp/tri5a/decode_dev )& # this will help find issues with the lexicon. 
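For reference, the one-command G.fst build that this patch substitutes for the old find_arpa_oovs.pl / remove_oovs.pl / eps2disambig.pl pipeline (in the sprakbanken, sre10, swahili and swbd scripts above and below) looks roughly like the sketch that follows. The directory and LM names are taken from fisher_create_test_lang.sh and are placeholders here; the command assumes words.txt already contains the #0 backoff disambiguation symbol.

# Minimal sketch of the new-style grammar build; paths are illustrative only.
lang_test=data/lang_test                              # a copy of data/lang
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz   # gzipped ARPA LM
gunzip -c "$arpa_lm" | \
  arpa2fst --disambig-symbol=#0 \
    --read-symbol-table=$lang_test/words.txt - $lang_test/G.fst
fstisstochastic $lang_test/G.fst    # the first number printed should be small
utils/validate_lang.pl $lang_test || exit 1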
diff --git a/egs/sre10/v1/path.sh b/egs/sre10/v1/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre10/v1/path.sh +++ b/egs/sre10/v1/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre10/v2/README.txt b/egs/sre10/v2/README.txt new file mode 100644 index 00000000000..1ba8705e089 --- /dev/null +++ b/egs/sre10/v2/README.txt @@ -0,0 +1,20 @@ + Data required for system development (on top of the data for testing described + in ../README.txt). We use SWBD and the older (prior to 2010) SREs to train the + supervised-GMM and iVector extractor. To create an in-domain system, the SREs + are needed to train the PLDA backend. The TDNN is trained on Fisher English. + + Corpus LDC Catalog No. + SWBD2 Phase 2 LDC99S79 + SWBD2 Phase 3 LDC2002S06 + SWBD Cellular 1 LDC2001S13 + SWBD Ceullar 2 LDC2004S07 + SRE2004 LDC2006S44 + SRE2005 Train LDC2011S01 + SRE2005 Test LDC2011S04 + SRE2006 Train LDC2011S09 + SRE2006 Test 1 LDC2011S10 + SRE2006 Test 2 LDC2012S01 + SRE2008 Train LDC2011S05 + SRE2008 Test LDC2011S08 + Fisher speech LDC2004S13, LDC2005S13 + Fisher test LDC2004T19, LDC2005T19 diff --git a/egs/sre10/v2/path.sh b/egs/sre10/v2/path.sh index 7cf73af8c53..e50f57c5271 100755 --- a/egs/sre10/v2/path.sh +++ b/egs/sre10/v2/path.sh @@ -1,3 +1,5 @@ -export KALDI_ROOT=$(cd ../../..; pwd) -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/ivectorbin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/sre10/v2/run.sh b/egs/sre10/v2/run.sh index aaa79a749b9..80d1bcf5944 100755 --- a/egs/sre10/v2/run.sh +++ b/egs/sre10/v2/run.sh @@ -8,7 +8,7 @@ # Results (EERs) are inline in comments below. # # This example script shows how to replace the GMM-UBM -# with a DNN trained for ASR. It also demonstrates the +# with a DNN trained for ASR. It also demonstrates the # using the DNN to create a supervised-GMM. . cmd.sh @@ -21,9 +21,6 @@ trials_male=data/sre10_test_male/trials trials=data/sre10_test/trials nnet=exp/nnet2_online/nnet_ms_a/final.mdl -# Use nnet-am-info to determine the size of the output layer. -num_components=5297 - # Train a DNN on about 1800 hours of the english portion of Fisher. 
local/dnn/train_dnn.sh @@ -66,16 +63,17 @@ steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ data/sre10_test exp/make_mfcc $mfccdir # Extract DNN features. -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/train_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre10_train_dnn exp/make_mfcc $mfccdir -steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ - data/sre10_test_dnn exp/make_mfcc $mfccdir - -for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre sre10_train sre10_test train; do +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/train_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre10_train_dnn exp/make_mfcc $mfccdir +steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + --cmd "$train_cmd" data/sre10_test_dnn exp/make_mfcc $mfccdir + +for name in sre_dnn sre10_train_dnn sre10_test_dnn train_dnn sre \ + sre10_train sre10_test train; do utils/fix_data_dir.sh data/${name} done @@ -89,7 +87,7 @@ sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ data/sre10_test exp/make_vad $vaddir -for name sre sre10_train sre10_test train; do +for name in sre sre10_train sre10_test train; do cp data/${name}/vad.scp data/${name}_dnn/vad.scp cp data/${name}/utt2spk data/${name}_dnn/utt2spk cp data/${name}/spk2utt data/${name}_dnn/spk2utt @@ -100,25 +98,27 @@ done # Subset training data for faster sup-GMM initialization. utils/subset_data_dir.sh data/train_dnn 32000 data/train_dnn_32k utils/fix_data_dir.sh data/train_dnn_32k -utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train data/train_32k +utils/subset_data_dir.sh --utt-list data/train_dnn_32k/utt2spk data/train \ + data/train_32k utils/fix_data_dir.sh data/train_32k # Initialize a full GMM from the DNN posteriors and speaker recognition # features. This can be used both alone, as a UBM, or to initialize the # i-vector extractor in a DNN-based system. sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \ - --num_components $num_components \ data/train_32k \ data/train_dnn_32k $nnet exp/full_ubm -# Train an i-vector extractor based on just the supervised-GMM. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ +# Train an i-vector extractor based on just the supervised-GMM. +sid/train_ivector_extractor.sh \ + --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm data/train \ exp/extractor_sup_gmm # Train an i-vector extractor based on the DNN-UBM. -sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ +sid/train_ivector_extractor_dnn.sh \ + --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ --min-post 0.015 \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm $nnet \ @@ -127,34 +127,40 @@ sid/train_ivector_extractor_dnn.sh --cmd "$train_cmd -l mem_free=80G,ram_free=80 exp/extractor_dnn # Extract i-vectors from the extractor with the sup-GMM UBM. 
-sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre10_train \ exp/ivectors_sre10_train_sup_gmm -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre10_test \ exp/ivectors_sre10_test_sup_gmm -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 50 \ +sid/extract_ivectors.sh \ + --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ exp/extractor_sup_gmm data/sre \ exp/ivectors_sre_sup_gmm # Extract i-vectors using the extractor with the DNN-UBM. -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre10_test \ data/sre10_test_dnn \ exp/ivectors10_test_dnn -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre10_train \ data/sre10_train_dnn \ exp/ivectors10_train_dnn -sid/extract_ivectors_dnn.sh --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh + --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ exp/extractor_dnn \ $nnet \ data/sre \ @@ -172,7 +178,7 @@ local/scoring_common.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre10_test_dnn # The commented out scripts show how to do cosine scoring with and without -# first reducing the i-vector dimensionality with LDA. PLDA tends to work +# first reducing the i-vector dimensionality with LDA. PLDA tends to work # best, so we don't focus on the scores obtained here. # # local/cosine_scoring.sh data/sre10_train data/sre10_test \ diff --git a/egs/swahili/s5/local/prepare_lm.sh b/egs/swahili/s5/local/prepare_lm.sh index 3d52417ca19..028aaa421f2 100755 --- a/egs/swahili/s5/local/prepare_lm.sh +++ b/egs/swahili/s5/local/prepare_lm.sh @@ -4,13 +4,5 @@ cd data #convert to FST format for Kaldi -cat local/swahili.arpa | ../utils/find_arpa_oovs.pl lang/words.txt > lang/oovs.txt -cat local/swahili.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - ../utils/remove_oovs.pl lang/oovs.txt | \ - ../utils/eps2disambig.pl | ../utils/s2eps.pl | fstcompile --isymbols=lang/words.txt \ - --osymbols=lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon > lang/G.fst +arpa2fst --disambig-symbol=#0 --read-symbol-table=lang/words.txt \ + local/swahili.arpa lang/G.fst diff --git a/egs/swahili/s5/path.sh b/egs/swahili/s5/path.sh index 3dc94fa8313..8b61dce675e 100755 --- a/egs/swahili/s5/path.sh +++ b/egs/swahili/s5/path.sh @@ -1,11 +1,16 @@ #!/bin/bash +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +#export PATH=$PWD/utils/:$PWD/steps/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C DATA_DIR=$PWD/data LEXICON=$DATA_DIR/local/dict/lexicon.txt EXP_DIR="dev test" TRAIN_DIR="train" -export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$PWD/steps/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH export LC_ALL=C diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5/local/swbd_p1_format_data.sh b/egs/swbd/s5/local/swbd_p1_format_data.sh index f0d38a08dd2..69ad44ccc50 100755 --- a/egs/swbd/s5/local/swbd_p1_format_data.sh +++ b/egs/swbd/s5/local/swbd_p1_format_data.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # if [ -f path.sh ]; then . path.sh; fi @@ -20,26 +20,13 @@ done rm -r data/lang_test cp -r data/lang data/lang_test -# grep -v ' ' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it]. 
gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst - fstisstochastic data/lang_test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_test/G.fst +fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. @@ -68,4 +55,3 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo swbd_p1_format_data succeeded. - diff --git a/egs/swbd/s5/path.sh b/egs/swbd/s5/path.sh index e1d916917f1..50eedcbb1f4 100755 --- a/egs/swbd/s5/path.sh +++ b/egs/swbd/s5/path.sh @@ -1,6 +1,8 @@ export KALDI_ROOT=`pwd`/../../.. [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh #export KALDI_ROOT=/home/dpovey/kaldi-trunk-test -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C export MKL_NUM_THREADS=16 diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh b/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh index 23c4945a8e7..06ea344be4d 100755 --- a/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/swbd/s5b/local/nnet/run_dnn_tandem_uc.sh @@ -89,7 +89,7 @@ if [ $stage -le 4 ]; then dir=exp/nnet5b_uc-part1 feature_transform=$dir/final.feature_transform.part1 nnet-concat $dir/final.feature_transform \ - "nnet-copy --remove-last-layers=4 --binary=false $dir/final.nnet - |" \ + "nnet-copy --remove-last-components=4 --binary=false $dir/final.nnet - |" \ "utils/nnet/gen_splice.py --fea-dim=80 --splice=2 --splice-step=5 |" \ $feature_transform || exit 1 diff --git a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh index a8eef429fe1..36f72b77083 100755 --- a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh @@ -20,7 +20,7 @@ EOF ( if [ ! -f exp/nnet5c_gpu/final.mdl ]; then - steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "-tc 5" \ + steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "--max-jobs-run 5" \ --num-threads 1 --minibatch-size 512 --max-change 40.0 --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 10 --num-epochs-extra 3 --initial-learning-rate 0.0067 --final-learning-rate 0.00067 \ --num-jobs-nnet 10 --num-hidden-layers 5 --hidden-layer-dim 1536 data/train_nodup data/lang \ diff --git a/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh b/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh index 1ed461027e1..dc56a8371fb 100755 --- a/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh +++ b/egs/swbd/s5b/local/online/run_nnet2_ms_disc.sh @@ -108,14 +108,14 @@ if [ $stage -le 3 ]; then if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi steps/nnet2/get_egs_discriminative2.sh \ - --cmd "$decode_cmd -tc $max_jobs" \ + --cmd "$decode_cmd --max-jobs-run $max_jobs" \ --online-ivector-dir exp/nnet2_online/ivectors_train_hires_nodup2 \ --criterion $criterion --drop-frames $drop_frames \ data/train_hires_nodup data/lang ${srcdir}{_ali,_denlats,/final.mdl,_degs} || exit 1; # the command below is a more generic, but slower, way to do it. #steps/online/nnet2/get_egs_discriminative2.sh \ - # --cmd "$decode_cmd -tc $max_jobs" \ + # --cmd "$decode_cmd --max-jobs-run $max_jobs" \ # --criterion $criterion --drop-frames $drop_frames \ # data/train_960 data/lang ${srcdir}{_ali,_denlats,_online,_degs} || exit 1; fi diff --git a/egs/swbd/s5b/path.sh b/egs/swbd/s5b/path.sh index db666cc10f6..2d17b17a84a 100755 --- a/egs/swbd/s5b/path.sh +++ b/egs/swbd/s5b/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/swbd/s5c/RESULTS b/egs/swbd/s5c/RESULTS index 5302ca6d700..4a95ae7c7a4 100644 --- a/egs/swbd/s5c/RESULTS +++ b/egs/swbd/s5c/RESULTS @@ -1,3 +1,10 @@ +#!/bin/bash +# eval2000, +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh; done 2>/dev/null +# swbd subset of eval2000, +for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh; done 2>/dev/null +exit 0 + # Note: we report the overall eval2000 performance and the Switchboard portion # of eval2000 (without CallHome) performance separately below. @@ -98,10 +105,105 @@ %WER 14.5 | 1831 21395 | 86.8 8.5 4.6 1.3 14.5 52.4 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg/score_12/eval2000_hires.ctm.swbd.filt.sys %WER 14.8 | 1831 21395 | 86.7 9.0 4.3 1.6 14.8 52.8 | exp/nnet2_online/nnet_ms_b_online/decode_eval2000_hires_sw1_tg_per_utt/score_10/eval2000_hires.ctm.swbd.filt.sys + +( +# old results with 25 million parameter model. We do not want to use such a big model. 
So see the new results below +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1280, recurrent_dim=384, lstm_delay=-1 -2 -3, label_delay=5 num_params=25010228 (8 epoch training on speed-perturbed +# and volume perturbed data) +%WER 11.4 | 1831 21395 | 89.8 6.8 3.4 1.2 11.4 46.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.7 | 1831 21395 | 88.6 7.6 3.8 1.3 12.7 48.7 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 16.8 | 4459 42989 | 85.1 10.4 4.5 1.9 16.8 52.8 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 18.1 | 4459 42989 | 84.0 11.2 4.8 2.0 18.1 54.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 22.0 | 2628 21594 | 80.5 13.9 5.6 2.5 22.0 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.3 | 2628 21594 | 79.4 14.7 6.0 2.7 23.3 59.2 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +) + + +# local/nnet3/run_lstm.sh +# these are results with nnet3 LSTMs cell_dim=1024, recurrent_dim=256, nonrecurrent_projection_dim=256, lstm_delay=-1 -2 -3, label_delay=5 num_params=14.6M (8 epoch training on speed-perturbed +# this setup has the newly introduced feature self-repair, in addition to shrink +%WER 11.6 | 1831 21395 | 89.7 6.9 3.4 1.3 11.6 46.9 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.6 | 1831 21395 | 88.7 7.6 3.7 1.4 12.6 49.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 21.3 | 2628 21594 | 81.0 13.2 5.8 2.4 21.3 57.3 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 23.1 | 2628 21594 | 79.5 14.7 5.8 2.6 23.1 59.6 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 10.1 4.6 1.8 16.5 53.0 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.9 | 4459 42989 | 84.1 11.2 4.8 2.0 17.9 55.5 | exp/nnet3/lstm_ld5_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 15.17 [ 7466 / 49204, 993 ins, 1937 del, 4536 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.12 [ 7931 / 49204, 1072 ins, 1910 del, 4949 sub ] exp/nnet3/lstm_ld5_sp/decode_train_dev_sw1_tg/wer_11_0.0 + + +# bidirectional LSTM +# ----------------------- +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 1024 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 +# (8 epoch training on speed-perturbed and volume perturbed data) +# num_params=20101172 +%WER 10.3 | 1831 21395 | 90.6 6.1 3.2 0.9 10.3 44.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 89.6 6.9 3.5 1.0 11.3 46.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 14.9 | 4459 42989 | 86.6 9.1 4.3 1.5 14.9 50.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.1 | 4459 42989 | 85.5 10.1 4.5 1.6 16.1 52.7 | 
exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 19.4 | 2628 21594 | 82.7 12.0 5.3 2.1 19.4 54.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 20.8 | 2628 21594 | 81.3 13.1 5.6 2.2 20.8 56.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys + +# results with nnet3 tdnn: local/nnet3/run_tdnn.sh (11.10.2015) (2 epoch training on speed-perturbed and volume perturbed data) +%WER 12.1 | 1831 21395 | 89.1 7.1 3.8 1.3 12.1 48.1 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.6 | 1831 21395 | 87.9 8.2 3.9 1.5 13.6 51.0 | exp/nnet3/tdnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys + +# results with nnet3 cnn+tdnn: local/nnet3/run_tdnn.sh --use_cnn true (1.2.2016) (2 epoch training on speed-perturbed and volume perturbed data) +%WER 12.0 | 1831 21395 | 89.3 7.1 3.7 1.3 12.0 47.1 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 22.4 | 2628 21594 | 80.1 13.7 6.2 2.6 22.4 57.6 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 17.3 | 4459 42989 | 84.7 10.5 4.9 2.0 17.3 53.5 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 13.5 | 1831 21395 | 88.0 8.1 3.9 1.5 13.5 49.4 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 24.3 | 2628 21594 | 78.6 15.0 6.4 2.9 24.3 60.0 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 18.9 | 4459 42989 | 83.2 11.5 5.3 2.2 18.9 55.6 | exp/nnet3/tdnn_cnn_sp/decode_eval2000_hires_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# current best 'chain' models with TDNNs (see local/chain/run_tdnn_2o.sh) +%WER 11.3 | 1831 21395 | 90.0 6.8 3.2 1.3 11.3 46.6 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.0 | 1831 21395 | 88.6 7.9 3.6 1.6 13.0 50.4 | exp/chain/tdnn_2o_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys + +# current best 'chain' models with LSTM (see local/chain/run_lstm_d.sh) +%WER 10.5 | 1831 21395 | 90.8 6.4 2.9 1.3 10.5 44.3 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 15.9 | 4459 42989 | 86.0 9.6 4.3 2.0 15.9 51.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 21.2 | 2628 21594 | 81.4 12.8 5.9 2.6 21.2 56.7 | exp/chain/lstm_d_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 13.88 [ 6829 / 49204, 935 ins, 1690 del, 4204 sub ] exp/chain/lstm_d_ld5_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 + +# these are results with nnet3 LSTMs with CTC training : local/ctc/run_lstm.sh +%WER 17.4 | 1831 21395 | 85.3 10.1 4.6 2.7 17.4 57.8 | exp/ctc/lstm_sp/decode_eval2000_sw1_fsh_fg_0.15/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 19.4 | 1831 21395 | 83.5 11.2 5.2 3.0 19.4 60.7 | exp/ctc/lstm_sp/decode_eval2000_sw1_tg_0.15/score_12_0.5/eval2000_hires.ctm.swbd.filt.sys + + # Resegmentation numbers for swbd subset. 
%WER 22.4 | 1831 21395 | 79.9 13.4 6.6 2.4 22.4 61.4 | exp/tri4a_reseg/decode_eval2000_sw1_tg/score_15/eval2000.ctm.swbd.filt.sys %WER 30.3 | 1831 21395 | 73.1 19.1 7.9 3.4 30.3 67.5 | exp/tri4a_reseg/decode_eval2000_sw1_tg.si/score_13/eval2000.ctm.swbd.filt.sys -# Raw fmllr numbers for swbd subset. +# Raw fmllr numbers for swbd subset. %WER 22.1 | 1831 21395 | 80.1 14.1 5.8 2.2 22.1 59.8 | exp/tri4b/decode_eval2000_sw1_tg/score_13/eval2000.ctm.swbd.filt.sys %WER 30.1 | 1831 21395 | 72.7 19.5 7.9 2.8 30.1 65.4 | exp/tri4b/decode_eval2000_sw1_tg.si/score_14/eval2000.ctm.swbd.filt.sys + + +### Karel's nnet1 +# nnet1 DNN recipe (29.09.2015), swbd subset, +# cross-entropy (3gram decoding, fisher 4gram rescoring), +%WER 14.6 | 1831 21395 | 87.0 8.9 4.2 1.6 14.6 52.3 | exp/dnn5b_pretrain-dbn_dnn/decode_eval2000_sw1_tg/score_12_0.5/eval2000.ctm.swbd.filt.sys +%WER 13.0 | 1831 21395 | 88.5 7.8 3.7 1.4 13.0 49.5 | exp/dnn5b_pretrain-dbn_dnn/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000.ctm.swbd.filt.sys +# sMBR (3gram decoding, fisher 4gram rescoring), +%WER 13.2 | 1831 21395 | 88.5 8.1 3.4 1.7 13.2 48.7 | exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2000_sw1_tg_it4/score_14_0.0/eval2000.ctm.swbd.filt.sys +%WER 11.7 | 1831 21395 | 89.9 7.1 3.0 1.6 11.7 45.8 | exp/dnn5b_pretrain-dbn_dnn_smbr/decode_eval2000_sw1_fsh_fg_it4/score_13_0.0/eval2000.ctm.swbd.filt.sys + +# nnet1 Tandem recipe local/nnet/run_dnn_tandem_uc.sh (29.09.2015), swbd subset, +# Stacked bottleneck network, +%WER 15.3 | 1831 21395 | 86.2 9.5 4.3 1.5 15.3 52.6 | exp/nnet5uc-part2/decode_eval2000_sw1_tg/score_13_0.0/eval2000.ctm.swbd.filt.sys +# GMMs on BN-features, +%WER 16.7 | 1831 21395 | 85.0 10.5 4.5 1.8 16.7 54.3 | exp/tri6uc/decode_eval2000_graph_sw1_tg/score_20_0.0/eval2000.ctm.swbd.filt.sys +%WER 15.8 | 1831 21395 | 85.8 9.9 4.4 1.6 15.8 53.2 | exp/tri7uc-sat/decode_eval2000_graph_sw1_tg/score_20_0.0/eval2000.ctm.swbd.filt.sys +%WER 14.6 | 1831 21395 | 87.1 9.2 3.6 1.8 14.6 51.8 | exp/tri7uc-sat_mmi_b0.1/decode_eval2000_graph_sw1_tg_it4/score_17_0.0/eval2000.ctm.swbd.filt.sys +# fisher 4gram rescoring, +%WER 13.2 | 1831 21395 | 88.3 8.2 3.4 1.5 13.2 49.2 | exp/tri7uc-sat_mmi_b0.1/decode_eval2000_graph_sw1_fsh_fg_it4/score_19_0.0/eval2000.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/callhm.perf b/egs/swbd/s5c/callhm.perf new file mode 100644 index 00000000000..a31a83b32bb --- /dev/null +++ b/egs/swbd/s5c/callhm.perf @@ -0,0 +1,33 @@ +%WER 25.6 | 2628 21594 | 77.8 16.0 6.2 3.4 25.6 63.0 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 22.7 | 2628 21594 | 79.9 13.6 6.5 2.6 22.7 59.0 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.6 | 2628 21594 | 80.0 13.5 6.5 2.6 22.6 58.5 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.6 | 2628 21594 | 80.0 13.2 6.8 2.6 22.6 59.2 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.5 | 2628 21594 | 80.1 13.5 6.4 2.6 22.5 58.6 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.5 | 2628 21594 | 79.6 12.5 7.9 2.2 22.5 59.1 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.3 13.1 6.6 2.6 22.3 59.2 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 
22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.2 13.2 6.6 2.5 22.3 57.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.3 | 2628 21594 | 80.1 13.3 6.6 2.4 22.3 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 22.2 | 2628 21594 | 80.2 13.0 6.8 2.4 22.2 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.6 22.1 58.5 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.5 22.1 58.6 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.4 13.3 6.3 2.5 22.1 58.6 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.1 | 2628 21594 | 80.0 12.3 7.7 2.2 22.1 58.0 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.6 13.1 6.4 2.5 22.0 58.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.6 13.1 6.3 2.6 22.0 58.5 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.5 7.2 2.4 22.0 58.1 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.0 7.7 2.2 22.0 57.7 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 22.0 | 2628 21594 | 80.3 12.0 7.7 2.2 22.0 57.7 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.7 13.2 6.1 2.6 21.9 58.1 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_8_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.6 13.3 6.1 2.5 21.9 58.2 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.6 12.9 6.6 2.5 21.9 58.2 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_9_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.9 | 2628 21594 | 80.4 12.3 7.3 2.3 21.9 59.1 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.8 | 2628 21594 | 80.6 12.5 6.9 2.4 21.8 58.2 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.8 | 2628 21594 | 80.3 12.6 7.1 2.1 21.8 58.4 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 2628 21594 | 80.9 13.0 6.1 2.6 21.7 58.2 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_8_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 
2628 21594 | 80.7 12.9 6.4 2.5 21.7 58.8 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.7 | 2628 21594 | 80.6 12.9 6.5 2.4 21.7 58.2 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_9_1.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.6 | 2628 21594 | 80.6 11.9 7.5 2.2 21.6 57.8 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys +%WER 21.5 | 2628 21594 | 80.7 12.8 6.5 2.3 21.5 58.3 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.5/eval2000_hires.ctm.callhm.filt.sys diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 036d89a9ea5..d500a690621 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,28 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then + # BUT cluster: + queue="all.q@@blade,all.q@@speech" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q long.q -l gpu=1" +fi diff --git a/egs/swbd/s5c/conf/decode_online.config b/egs/swbd/s5c/conf/decode_online.config new file mode 100644 index 00000000000..410ca63c28b --- /dev/null +++ b/egs/swbd/s5c/conf/decode_online.config @@ -0,0 +1,2 @@ +beam=11.0 # beam for decoding. 
+first_beam=8.0 # beam for 1st-pass decoding in SAT. diff --git a/egs/swbd/s5c/conf/mfcc_dbl3.conf b/egs/swbd/s5c/conf/mfcc_dbl3.conf new file mode 100644 index 00000000000..f0e09186f3e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_dbl3.conf @@ -0,0 +1,16 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=10 # for the higher-frequency-resolution mfcc coefficients, we'll use + # a larger window size of 25ms and the normal window. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=17 # shorter than normal (25ms) frame length.... the shortest we can + # go without the FFT becoming lower resolution which might cause + # problems +--window-type=hanning # additionally making the context shorter by using a more aggressively tapering window. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf new file mode 100644 index 00000000000..c41b76116ee --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl.conf @@ -0,0 +1,12 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=20 # slightly less than the normal 25ms frame length. +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf new file mode 100644 index 00000000000..92670e7ed6e --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hires_dbl2.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features extracted at double the normal frame +# rate, intended for neural network training. Note: we keep all cepstra, so it +# has the same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-shift=5 # half the normal frame shift diff --git a/egs/swbd/s5c/conf/mfcc_hiresf.conf b/egs/swbd/s5c/conf/mfcc_hiresf.conf new file mode 100644 index 00000000000..c0b1798a9c5 --- /dev/null +++ b/egs/swbd/s5c/conf/mfcc_hiresf.conf @@ -0,0 +1,12 @@ +# this is a config for 'fast' (7.5ms frame shift) high-resolution MFCC features, +# intended for use with chain models. Note: we keep all cepstra, so it has the +# same info as filterbank features, but MFCC is more easily compressible +# (because less correlated) which is why we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800) +--frame-length=25 # the normal frame length +--frame-shift=7.5 diff --git a/egs/swbd/s5c/local/chain/README.txt b/egs/swbd/s5c/local/chain/README.txt new file mode 100644 index 00000000000..8e347f4f889 --- /dev/null +++ b/egs/swbd/s5c/local/chain/README.txt @@ -0,0 +1,29 @@ + +there are a lot of tuning experiments here. + +ones to look at right now: + 2y is a TDNN baseline + 4f is a good jesus-layer system + 4q is an improved TDNN with various bells and whistles from Vijay. + 4r is a slightly-better jesus-layer system than 4f, with one more layer. + 5e is the best configuration run so far that doesn't have statistics-averaging layers. + 5g uses a statistics-averaging layer in the middle to slightly improve on 5e (by about + 0.2%). + 5j is a basic configuration without iVectors (about 2% abs worse than 5e) + 5k is the best configuration without iVectors... about 1% abs worse than 5e; we + use statistics-averaging layers to do some crude adaptation. + 5t gives about the same performance as 5e but is about 30% faster to train + and is smaller. + 5v is what I am currently using as a baseline- it has an even smaller + --jesus-hidden-dim than 5t (hence faster to train), but gives the same + performance. + 6g is a setup with a 'thinner' jesus-layer (with only one repeated-affine component) + and slightly more parameters, which is quicker to train than 5v but gives + about the same results. I'm hoping to use this setup, going forward. + 6i is like 6g but with a separate last-but-one affine layer for the xent output + (marginally better than 6g). + 6z is probably the thing I currently recommend to run-- it's a TDNN+ReLU based + setup that's quite fast to train and gives better results than our old + jesus-layer-based system.
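# Editorial aside (not part of the patch): the mfcc_*.conf files added above are
# consumed through the standard feature-extraction wrapper. A hedged usage sketch,
# assuming cmd.sh has been sourced and that a data directory data/train_fast exists
# (the directory and output names here are hypothetical):
steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" \
  --mfcc-config conf/mfcc_hiresf.conf \
  data/train_fast exp/make_mfcc/train_fast mfcc_fast
steps/compute_cmvn_stats.sh data/train_fast exp/make_mfcc/train_fast mfcc_fast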
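# Editorial aside (not part of the patch): the compare_wer.sh script added in the next
# hunk summarizes several of the experiments listed in this README side by side. One
# possible invocation, run from egs/swbd/s5c and assuming the corresponding
# exp/chain/tdnn_*_sp directories exist:
local/chain/compare_wer.sh 5e 5t 5v 6g 6z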
+ + diff --git a/egs/swbd/s5c/local/chain/compare_wer.sh b/egs/swbd/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ded03563711 --- /dev/null +++ b/egs/swbd/s5c/local/chain/compare_wer.sh @@ -0,0 +1,62 @@ +#!/bin/bash + + +echo -n "System " +for x in $*; do printf "% 10s" $x; done +echo + +echo -n "WER on train_dev(tg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on train_dev(fg) " +for x in $*; do + wer=$(grep WER exp/chain/tdnn_${x}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(tg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "WER on eval2000(fg) " +for x in $*; do + wer=$(grep Sum exp/chain/tdnn_${x}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob +done +echo diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h.sh b/egs/swbd/s5c/local/chain/run_blstm_6h.sh new file mode 100755 index 00000000000..b19a0b489a0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +#%WER 9.6 | 1831 21395 | 91.6 5.8 2.6 1.2 9.6 44.2 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 14.5 | 4459 42989 | 87.4 8.9 3.7 1.9 14.5 50.5 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.3 | 2628 21594 | 83.3 11.8 4.9 2.5 19.3 54.8 | exp/chain/blstm_6h_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 13.32 [ 6554 / 49204, 830 ins, 1696 del, 4028 sub ] exp/chain/blstm_6h_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +label_delay=0 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" [-3,3] [-3,3] [-3,3] " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh b/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh new file mode 100755 index 00000000000..b0264c17d8b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_6h_discriminative.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +. cmd.sh + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +# Frame chunk options that will be used for blstm models. +frames_per_chunk=150 +extra_left_context=40 +extra_right_context=40 +extra_left_context_initial=-1 +extra_right_context_final=-1 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/chain/blstm_6h_sp +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.000000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires/ mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir $context_opts \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. + steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir $context_opts \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates false \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_sw1_tg +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in train_dev eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_$iter ; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_$iter ; + fi + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/swbd/s5c/local/chain/run_blstm_d.sh b/egs/swbd/s5c/local/chain/run_blstm_d.sh new file mode 100755 index 00000000000..74cea0c28ab --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_blstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" [-3,3] [-3,3] [-3,3] " +label_delay=0 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=128 +non_recurrent_projection_dim=128 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.9 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_6h.sh b/egs/swbd/s5c/local/chain/run_lstm_6h.sh new file mode 100755 index 00000000000..0e777d85fac --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_6h.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# based on run_tdnn_6h.sh + +# %WER 15.6 | 4459 42989 | 86.1 9.2 4.7 1.8 15.6 52.1 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# %WER 10.3 | 1831 21395 | 90.9 6.1 3.0 1.3 10.3 44.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 20.7 | 2628 21594 | 82.0 12.8 5.3 2.7 20.7 56.7 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys + +# if right-tolerance was 10 (these are old results) +#--------------------------- +# %WER 15.8 | 4459 42989 | 86.0 9.3 4.8 1.8 15.8 52.0 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 10.6 | 1831 21395 | 90.6 6.2 3.2 1.2 10.6 45.2 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 21.0 | 2628 21594 | 81.4 12.4 6.3 2.4 21.0 56.8 | exp/chain/lstm_6h_ld5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.callhm.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6h2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +label_delay=5 +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/lstm/make_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes="-2,-1,0,1,2 0 0" \ + --lstm-delay=" -3 -3 -3 " \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --num-lstm-layers 3 \ + --cell-dim 1024 \ + --hidden-dim 1024 \ + --recurrent-projection-dim 256 \ + --non-recurrent-projection-dim 256 \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_lstm_d.sh b/egs/swbd/s5c/local/chain/run_lstm_d.sh new file mode 100755 index 00000000000..05db63c2bee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_lstm_d.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# based on run_tdnn_2o.sh + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -3 -3 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 + +# decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --splice-indexes "$splice_indexes " \ + --num-lstm-layers $num_lstm_layers \ + --cell-dim $cell_dim \ + --hidden-dim $hidden_dim \ + --recurrent-projection-dim $recurrent_projection_dim \ + --non-recurrent-projection-dim $non_recurrent_projection_dim \ + --label-delay $label_delay \ + --self-repair-scale 0.00001 \ + $dir/configs || exit 1; + +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00001 \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --chain.left-deriv-truncate 0 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/lstm/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 250 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2a.sh b/egs/swbd/s5c/local/chain/run_tdnn_2a.sh new file mode 100755 index 00000000000..98d9130989a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". +# Note, this leads to a cutoff of zero, so it's the same as infinite --num-extra-states. +# The table below compares a sequence of experiments {x,s,w,z,2a} where only +# the --num-extra-states is varied. + +# I'm also adding to this table some other experiments: 2d, which also had +# --num-extra-states=2000 --ngram-order=4 --leftmost-context-questions=/dev/null [so +# there was no concept of sets of phones for the 3-gram, plus we could go to 4-gram]. 
+# [note that the actual baseline for 2d was 2c, which was as 2a but with +# a code change RE transition-scale, but that made no consistent difference, so +# acting as if that was a no-op.] +# +# +# Comparing the --num-extra-states: +# +# --num-extra-states: 0 200 500 2000 8000 *these all had the default --leftmost-context-questions, splitting to ~23 sets.] +# --num-extra-states: 2000 *plus: --ngram-order=4 --leftmost-context-questions=/dev/null [so 3gram and 4gram all in one set, and 4gram allowed.] +# new code, --num-lm-states,--ngram-order: 10k,5 7k,5 5k,4 (this pruned on state count and only left bigrams unpruned) +# newer code, --num-extra-lm-states (note, ngram-order=5,no-prune-order=3) 2000 1000 (prune on perplexity, no-prune default=3gram). +# experiment: x s w z 2a | 2d | 2f 2g 2h | 2i 2j +# WER (train_dev,tg) 18.67 18.45 *18.02 18.06 18.20 |*17.55 | 17.49*17.28 17.46 |*17.44 17.54 +# WER (train_dev,fg) 17.22 16.96 16.70 *16.46 16.59 |*16.14 | 16.21 16.14 *16.08 |*16.09 16.20 +# WER (eval2000,tg) 20.4 20.1 19.9 *19.7 19.8 |*19.5 | 19.6 *19.4 19.5 |*19.2 *19.2 +# WER (eval2000,fg) 18.4 18.0 17.9 18.0 *17.7 |*17.6 | 17.8 17.7 *17.6 | 17.3 *17.2 +# #states in den.fst 29384 30064 30744 31487 31729 | 37451 | 48591 42804 38818 | 35460 33272 +# #arcs in den.fst 249524 252690 255242 251118 238678| 342831|618289 515353 428241 | 299068 267092 +# LM perplexity 8.78 8.07 7.76 7.39 7.37 | 6.34 | 5.75 6.04 6.27 | 6.07 6.35 +# # phone-lm states 2644 2864 3092 4321 6438 | 7437 | 10000 7000 5000 | 8437 7437 +# # phone-lm arcs 44581 50007 54167 68044 73839 | 118699|192690 146938 110505 | 100969 88520 + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2a # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=8000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2b.sh b/egs/swbd/s5c/local/chain/run_tdnn_2b.sh new file mode 100755 index 00000000000..0515f73b434 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2b.sh @@ -0,0 +1,255 @@ +#!/bin/bash + +# _2b is as _y but --frames-overlap-per-eg 75 (was 30 before). This is not very +# efficient in terms of disk space but I want to see the effect on results. + +# In terms of the objf, the training is a lot better, -0.0879->-0.0779, and validation is +# slightly better: -0.126 -> -0.123. +# But the WERs are 0.3 worse across the board: on train_dev, with tg 18.04->18.15, with fg +# 16.57->16.83; on all of eval2000, with tg 13.2->13.7, and with fg 11.7->12.0. +# I'm a little at a loss how to interpret these. +# Note: I decode an earlier iter (300) but the results were not much better: final->300, +# 13.7->13.7 on all of eval2000 with tg, and 18.15->18.10 on all of train_dev with tg. + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. 
It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2b # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 75" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + iter=300 + steps/nnet3/decode.sh --iter $iter --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_it$iter || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_it$iter || exit 1; + fi + ) & + done +fi + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2c.sh b/egs/swbd/s5c/local/chain/run_tdnn_2c.sh new file mode 100755 index 00000000000..ffd2044c272 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2c.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# As expected the results are consistent with randomness: 2a->2c, on all of eval2000, +# before rescoring 19.8->19.8 and after rescoring 17.7->17.8; on train_dev, +# before rescoring 18.20->18.12, and after rescoring 16.59->16.73. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. 
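+
+# Illustration of the edge treatment just described (a sketch only, based on the wording
+# above; the real per-frame weights are dumped with the egs by the egs-preparation code):
+# for a 150-frame eg, weight 0 on the outermost 10 frames of each side, then a linear ramp
+# up to 1.0 over the next 10 frames, e.g.:
+#
+#   awk 'BEGIN { n=150; zero=10; ramp=10;
+#     for (t=0; t<n; t++) {
+#       d = (t < n-1-t) ? t : n-1-t;                  # distance from the nearer edge of the eg
+#       if (d < zero)           w = 0.0;              # zero derivative weight at the edges
+#       else if (d < zero+ramp) w = (d-zero+1)/ramp;  # ramp up to 1.0 over 10 frames
+#       else                    w = 1.0;
+#       printf("%d %.2f\n", t, w); } }'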
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=8000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2d.sh b/egs/swbd/s5c/local/chain/run_tdnn_2d.sh new file mode 100755 index 00000000000..c93121499cd --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2d.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
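+
+# For reference, the transition-scale / self-loop-scale change described in the _2c note above
+# amounts to the following difference in the graph-building call (both forms appear verbatim in
+# the scripts in this directory; the form used has to match how the scales were applied in
+# training):
+#
+#   pre-2c:      utils/mkgraph.sh --transition-scale 0.0 --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+#   2c onwards:  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg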
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2d # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2e.sh b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh new file mode 100755 index 00000000000..a8552244ed2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2e.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# _2e is as _2b, but --frames-overlap-per-eg 0 (also compare with _y, which has +# an overlap of 30; _2b has 75). BUT we also made a code change as in 2a->2c, where we use +# transition-scale and self-loop-scale of 1, so we are making the same change in +# 2b->2e; it requires a script change too, to match. 
we'll have to correct the +# results for this. (note: this won't matter as the results did not change) +# +# Comparing results: +# expt: _2b _y _2e _s +# --frames-overlap-per-eg 75 30 0 30 +# --apply-deriv-weights f f f t +# all of eval2000 (tg) 20.1 19.8 19.7 20.1 +# all of eval2000 (fg) 18.0 17.9 17.8 18.0 +# train_dev (tg) 18.15 18.04 17.85 18.45 +# train_dev (fg) 16.83 16.57 16.52 16.96 +# ... on all of these tests, results are consistently better towards smaller +# --frames-overlap-per-eg. and apply-deriv-weights=f seems better. +# + + +# _2b is as _y but --frames-overlap-per-eg 75 (was 30 before). This is not very +# efficient in terms of disk space but I want to see the effect on results. + +# In terms of the objf, the training is a lot better, -0.0879->-0.0779, and validation is +# slightly better: -0.126 -> -0.123. +# But the WERs are 0.3 worse across the board: on train_dev, with tg 18.04->18.15, with fg +# 16.57->16.83; on all of eval2000, with tg 13.2->13.7, and with fg 11.7->12.0. +# I'm a little at a loss how to interpret these. +# Note: I decode an earlier iter (300) but the results were not much better: final->300, +# 13.7->13.7 on all of eval2000 with tg, and 18.15->18.10 on all of train_dev with tg. + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). 
+# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2e # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +exit 0; + + +# BROKEN results where I had overlap of 75, so it was mostly just a repetition of _2b, except with +# that 2a->2c change. + +b01:s5c: for l in y 2b 2e; do grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | utils/best_wer.sh ; done +%WER 13.2 | 1831 21395 | 88.4 8.0 3.6 1.6 13.2 50.6 | exp/chain/tdnn_y_sp/decode_eval2000_sw1_tg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.7 | 1831 21395 | 88.1 8.2 3.7 1.8 13.7 51.0 | exp/chain/tdnn_2b_sp/decode_eval2000_sw1_tg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 13.4 | 1831 21395 | 88.4 8.2 3.4 1.8 13.4 50.8 | exp/chain/tdnn_2e_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +b01:s5c: for l in y 2b 2e; do grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh ; done +On iteration 368, learning rate is 0.00304840891076219. 
+Training neural net (pass 368) +%WER 11.7 | 1831 21395 | 89.7 7.0 3.2 1.4 11.7 47.8 | exp/chain/tdnn_y_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.0 | 1831 21395 | 89.5 7.1 3.4 1.5 12.0 49.4 | exp/chain/tdnn_2b_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 12.1 | 1831 21395 | 89.4 7.5 3.1 1.5 12.1 48.4 | exp/chain/tdnn_2e_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +b01:s5c: +b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh ; done +%WER 18.04 [ 8877 / 49204, 1125 ins, 2296 del, 5456 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_tg/wer_12_0.0 +%WER 18.15 [ 8930 / 49204, 1121 ins, 2244 del, 5565 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_tg/wer_12_0.0 +%WER 18.24 [ 8975 / 49204, 1242 ins, 2064 del, 5669 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_tg/wer_11_0.0 +b01:s5c: for l in y 2b 2e; do grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh ; done +%WER 16.57 [ 8155 / 49204, 1144 ins, 1988 del, 5023 sub ] exp/chain/tdnn_y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +%WER 16.83 [ 8282 / 49204, 1106 ins, 2115 del, 5061 sub ] exp/chain/tdnn_2b_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +%WER 16.79 [ 8260 / 49204, 1090 ins, 2138 del, 5032 sub ] exp/chain/tdnn_2e_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2f.sh b/egs/swbd/s5c/local/chain/run_tdnn_2f.sh new file mode 100755 index 00000000000..86c23acc90c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2f.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# see table in run_tdnn_2a.sh for results + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2f # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=5 --num-lm-states=10000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2g.sh b/egs/swbd/s5c/local/chain/run_tdnn_2g.sh new file mode 100755 index 00000000000..db2f7a00410 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2g.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# _2g is as _2f but reducing the --num-lm-states from 10k to 7k +# see table in run_tdnn_2a.sh for results. + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# LM perplexity changes from 6.34 to 5.75. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
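+
+# A quick way to reproduce the den.fst / phone-LM size figures quoted in the table in
+# run_tdnn_2a.sh for a finished run (a sketch: it assumes the denominator FST and the phone
+# LM end up in the experiment directory as den.fst and phone_lm.fst, which should be checked
+# against what this version of the training script actually writes):
+#
+#   fstinfo exp/chain/tdnn_2g_sp/den.fst      | grep -E '# of (states|arcs)'
+#   fstinfo exp/chain/tdnn_2g_sp/phone_lm.fst | grep -E '# of (states|arcs)'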
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2g # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=5 --num-lm-states=7000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2h.sh b/egs/swbd/s5c/local/chain/run_tdnn_2h.sh new file mode 100755 index 00000000000..9d5bfdd1207 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2h.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# _2h is as _2g but --ngram-order=4, and --num-lm-states=5k. +# see table in run_tdnn_2a.sh for results. + +# _2g is as _2f but reducing the --num-lm-states from 10k to 7k. + +# _2f is as _2d but following a code change, and with different LM options: +# --ngram-order=5 --num-lm-states=10000 +# Now the extra questions are not needed. +# LM perplexity changes from 6.34 to 5.75. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2h # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
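+  # (Background note, added; our understanding of the 'chain' setup rather than
+  #  part of the original script: the --lm-opts string passed to train_tdnn.sh
+  #  below controls the phone-level n-gram LM from which the denominator graph
+  #  is built. The header comments above track its order and pruning
+  #  (--ngram-order, --num-lm-states) and the resulting LM perplexity across
+  #  the 2f/2g/2h variants.)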
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--ngram-order=4 --num-lm-states=5000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh new file mode 100755 index 00000000000..eaa5a77949f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -0,0 +1,239 @@ +#!/bin/bash + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# see table in run_tdnn_2a.sh for results + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. 
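+# (Added illustration of the stochasticity check mentioned above; the graph
+#  path is hypothetical -- any graph directory built by the mkgraph stage would
+#  do:
+#    fstisstochastic exp/chain/tdnn_2i_sp/graph_sw1_tg/HCLG.fst
+#  which prints the min/max deviation from stochasticity, something that is
+#  only meaningful once transition-scale and self-loop-scale are 1.0.)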
+ +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. 
+ +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2i # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2j.sh b/egs/swbd/s5c/local/chain/run_tdnn_2j.sh new file mode 100755 index 00000000000..70ba86a3fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2j.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# _2j is as _2i but with --num-extra-lm-states=1000, not 2000. 
+# see table in run_tdnn_2a.sh for results + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
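+# (Illustrative sketch, added to make the _p note above concrete: with the
+#  egs-edge weighting described there, each edge of an example gets 10 frames
+#  of zero derivative weight followed by a ramp up to 1.0 over the next 10
+#  frames; a linear ramp is assumed here. The commented loop below just prints
+#  such a weight schedule and is not part of the recipe.)
+#   for t in $(seq 0 24); do
+#     if   [ $t -lt 10 ]; then w=0.0
+#     elif [ $t -lt 20 ]; then w=0.$((t - 10))
+#     else w=1.0; fi
+#     echo "frame $t deriv-weight $w"
+#   done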
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2j # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
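+  # (Added note: --pdf-boundary-penalty 0.0, used in the command below,
+  #  disables the boundary penalty; per the comments in the later 2k and 2n
+  #  scripts this behaviour became the default and the option was then removed
+  #  from the script.)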
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-lm-states=1000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2k.sh b/egs/swbd/s5c/local/chain/run_tdnn_2k.sh new file mode 100755 index 00000000000..fb1f59d3c5a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2k.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. +# +# This is helpful more often than not (but it doesn't seem to make as much +# of a difference as it did before). +# 2i 2k +# train_dev,tg 17.44 17.08 +# train_dev,fg 16.09 15.79 +# eval2000,tg 19.2 19.3 +# eval2000,fg 17.3 17.3 + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. 
+ +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2k # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # note, I removed the --pdf-boundary 0.0 option after taking it out of the script + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
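+  # (Background note, added; standard for chain systems rather than specific to
+  #  this patch: the graph is built with --self-loop-scale 1.0 and decoding is
+  #  done with --acwt 1.0; the --post-decode-acwt 10.0 option in the decode
+  #  stage below only rescales the lattice scores into the usual range expected
+  #  by the scoring scripts.)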
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2l.sh b/egs/swbd/s5c/local/chain/run_tdnn_2l.sh new file mode 100755 index 00000000000..56365029f3c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2l.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# _2l is as _2k, but using 100 frames per eg instead of 150. +# Previously we had found 150 better than 75, but this may have changed as we +# are no longer treating the edges in the same way (e.g. we now use +# --pdf-boundary-penalty=0.0). So re-tuning. + +# This is: [better by 0.1, better by 0.1, the same, worse by 0.1]. So +# I guess it's either not sensitive to this, or the optimal value lies +# somewhere in between. I'm leaving it at 150 in the scripts for +# now, but if we have memory problems in the future, we can reduce to 100. +# +# 2k 2l +# --frames-per-eg 150 100 +# train_dev,tg 17.08 16.99 +# train_dev,fg 15.79 15.67 +# eval2000,tg 19.3 19.3 +# eval2000,fg 17.3 17.4 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2l # Note: _sp will get added to this if $speed_perturb == true. 
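+# (Sketch of how the _sp suffix is typically applied; that part of the script
+#  is not shown in this patch, so the exact lines below are assumptions. The
+#  names $suffix and $train_set are consistent with their later use in the
+#  train_tdnn.sh call.)
+#   suffix=
+#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
+#   dir=${dir}${suffix}
+#   train_set=train_nodup${suffix}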
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2m.sh b/egs/swbd/s5c/local/chain/run_tdnn_2m.sh new file mode 100755 index 00000000000..93ba4ac82b3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2m.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# So it's [0.1 worse, 0.1 worse, 0.6 better, 0.3 better]: better on average. +# Which kind of makes sense (we expected that the previous limitation on how the +# tree was built would not be helpful). 
+ +# 2k 2m +# --leftmost-questions-truncate 30 -1 +# train_dev,tg 17.08 17.22 +# train_dev,fg 15.79 15.87 +# eval2000,tg 19.3 18.7 +# eval2000,fg 17.3 17.0 +# in tree-building, +# like-impr 4.9099 5.33844 +# Den-fst num-states 35460 299068 +# Den-fst num-arcs 47036 331403 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). 
+# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2m # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
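+  # (Added note, our reading of the _2k comment above: with
+  #  --apply-deriv-weights false the per-frame derivative weights dumped with
+  #  the egs are ignored, so the edge overlap is also set to 0 via
+  #  --egs-opts "--frames-overlap-per-eg 0" in the command below.)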
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2n.sh b/egs/swbd/s5c/local/chain/run_tdnn_2n.sh new file mode 100755 index 00000000000..c90c5f0a41f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2n.sh @@ -0,0 +1,301 @@ +#!/bin/bash + +# _2n is as _2m, but using the combine-data script to ensure that we don't have +# very short segments (this can cause an excessive amount of either missing or +# overlapped data in the egs). + +# (m->n) This doesn't seem to make a consistent difference, but maybe a little worse. +# Note, the tree-split improvement was more in 2n. I suspect this it's because we +# did the alignments after the 'max1' thing, and the fMLLR was somehow more +# utterance-specific. + +# WER on 2m 2n +# train_dev,tg 17.22 17.11 0.1 better +# train_dev,fg 15.87 15.75 0.1 better +# eval2000,tg 18.7 19.2 0.5 worse +# eval2000,fg 17.0 17.2 0.2 worse +# +# tree-split impr 5.34 5.78 +# train-prob,final -0.080 -0.090 +# valid-prob,final -0.116 -0.1006 # note, the 2n valid prob is not correct, because +# # the combine_data.sh script doesn't preserve utt2uniq info. + +# (note: I removed the --pdf-boundary-penalty 0.0 option from the script as it's +# now the default, and no longer supported.) + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. 
By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. 
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2n # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=true +min_segment_length=8 # min length in seconds, for combining data. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
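+  # (Added explanation, to the best of our understanding: on the CLSP grid,
+  #  utils/create_split_dir.pl above creates real directories on the listed
+  #  /export/b0{5,6,7,8} filesystems and makes $dir/egs/storage point at them,
+  #  so the large egs archives are spread over several disks; elsewhere the
+  #  egs are simply written under $dir/egs.)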
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/chain/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/$train_set_hires $treedir exp/tri4_lats_${train_set} $dir || exit 1; +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2o.sh b/egs/swbd/s5c/local/chain/run_tdnn_2o.sh new file mode 100755 index 00000000000..5a8166acbf7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2o.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# Correction: after rerunning, it actually seems a little worse. +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o 2o[rerun after delete] +# train_dev,tg 17.22 17.24 17.19 +# train_dev,fg 15.87 15.93 15.89 +# eval2000,tg 18.7 18.7 19.3 +# eval2000,fg 17.0 16.9 17.4 + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. 
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2o # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2p.sh b/egs/swbd/s5c/local/chain/run_tdnn_2p.sh new file mode 100755 index 00000000000..3ff85ad5562 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2p.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _2p is as _2m, but 6500 instead of 9000 as the target for num-leaves. + +# consistently slightly worse. + +# WER on 2m 2p +# train_dev,tg 17.22 17.42 0.2 worse +# train_dev,fg 15.87 16.07 0.2 worse +# eval2000,tg 18.7 19.0 0.3 worse +# eval2000,fg 17.0 17.1 0.1 worse +# +# oddly, the final train and valid probs were better. +# final-train -0.0803 -0.0791 +# final-valid -0.0116 -0.0115 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. +# Caution: I accidentally overwrote its treedir with the '2o' experiment, so I +# moved it to '2o'. But the 2m experiment was done by then. + +# So it's [0.1 worse, 0.1 worse, 0.6 better, 0.3 better]: better on average. +# Which kind of makes sense +# +# 2k 2m +# --leftmost-questions-truncate 30 -1 +# train_dev,tg 17.08 17.22 +# train_dev,fg 15.79 15.87 +# eval2000,tg 19.3 18.7 +# eval2000,fg 17.3 17.0 +# in tree-building, +# like-impr 4.9099 5.33844 +# Den-fst num-states 35460 299068 +# Den-fst num-arcs 47036 331403 + + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". 
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+ +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2p # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6500 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2q.sh b/egs/swbd/s5c/local/chain/run_tdnn_2q.sh new file mode 100755 index 00000000000..2c7669cdbc4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2q.sh @@ -0,0 +1,268 @@ +#!/bin/bash + +# _2q is as _2o but changing from 9000 -> 6000 states as the target. +# (like 2p, where it wasn't helpful, but doing this experiment for the topology with fewer state). 
+ +# it's consistently a little worse. +# WER on 2o 2q +# train_dev,tg 17.24 17.43 0.2% worse +# train_dev,fg 15.93 16.07 0.2% worse +# eval2000,tg 18.7 19.0 0.3% worse +# eval2000,fg 16.9 17.1 0.2% worse +# train-prob -0.08352 -0.08441 +# valid-prob -0.1218 -0.01221 + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2q # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_o $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2r.sh b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh new file mode 100755 index 00000000000..d17ebdf9be7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2r.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# _2r is as _2q, but further changing the topology to have one rather than +# two pdf-ids per triphone. + +# it's consistently worse, and a fairly substantial difference. +# WER on 2q 2r +# train_dev,tg 17.43 17.82 0.4% worse +# train_dev,fg 16.07 16.64 0.6% worse +# eval2000,tg 19.0 19.8 0.8% worse +# eval2000,fg 17.1 18.0 0.9% worse +# train-prob -0.08441 -0.08318 +# valid-prob -0.01221 -0.1272 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. 
We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. 
+ +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2r # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. The "--pdf-class-list=0" option is + # needed, as in this type of topology we only have a single pdf-class, + # numbered zero. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cluster-phones-opts "--pdf-class-list=0" \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set data/lang_chain_2r $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + + +# Just a note: I saw some warnings like this in the logs: + +WARNING (nnet3-chain-normalize-egs:main():nnet3-chain-normalize-egs.cc:72) For example sp1.0-sw02859-B_050239-051084-0, FST was empty after composing with normalization FST. This should be extremely rare (a few per corpus, at most) + +#below is how I verified that they were caused by a benign cause.. it was that the lattice versus +#1-best alignment had different paths (and presumably the lattice didn't have the same path +#contained in the 1-best. +# +# after the first ow_S we have a silence in the 1-best: + +copy-int-vector 'ark:gunzip -c exp/chain/tri5r_tree_sp/ali.45.gz |' ark,t:- | grep sp1.0-sw02859-B_050239-051084 | ali-to-phones exp/chain/tri5r_tree_sp/final.mdl ark:- ark,t:- | utils/int2sym.pl -f 2- data/lang/phones.txt +copy-int-vector 'ark:gunzip -c exp/chain/tri5r_tree_sp/ali.45.gz |' ark,t:- +ali-to-phones exp/chain/tri5r_tree_sp/final.mdl ark:- ark,t:- +LOG (copy-int-vector:main():copy-int-vector.cc:83) Copied 5884 vectors of int32. +LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. 
+sp1.0-sw02859-B_050239-051084 sil ow_S sil ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil + + +# but in the lattice (which seems to be linear at that point), after the first +# ow_S there is no silence: + +lattice-copy "ark:gunzip -c exp/tri4_lats_nodup_sp/lat.45.gz |" "scp,t,p:echo sp1.0-sw02859-B_050239-051084 -|" | lattice-best-path "scp:echo sp1.0-sw02859-B_050239-051084 -|" ark:/dev/null ark,t:- | ali-to-phones exp/tri4/final.mdl ark:- ark,t:- | utils/int2sym.pl -f 2- data/lang/phones.txt +lattice-copy 'ark:gunzip -c exp/tri4_lats_nodup_sp/lat.45.gz |' 'scp,t,p:echo sp1.0-sw02859-B_050239-051084 -|' +lattice-best-path 'scp:echo sp1.0-sw02859-B_050239-051084 -|' ark:/dev/null ark,t:- +ali-to-phones exp/tri4/final.mdl ark:- ark,t:- +LOG (lattice-best-path:main():lattice-best-path.cc:99) For utterance sp1.0-sw02859-B_050239-051084, best cost 53.7031 + 39521.9 = 39575.6 over 843 frames. +LOG (lattice-best-path:main():lattice-best-path.cc:124) Overall score per frame is 46.9461 = 0.0637047 [graph] + 46.8824 [acoustic] over 843 frames. +LOG (lattice-best-path:main():lattice-best-path.cc:128) Done 1 lattices, failed for 0 +LOG (ali-to-phones:main():ali-to-phones.cc:134) Done 1 utterances. +sp1.0-sw02859-B_050239-051084 sil ow_S ay_B k_I m_I ax_I n_E hh_B ih_I m_I s_I eh_I l_I f_E ih_B f_E hh_B iy_E hh_B ae_I d_E s_B ah_I m_E t_B ae_I l_I ih_I n_I t_E ax_B r_I aw_I n_I d_E ay_S th_B ih_I ng_I k_E dh_B ey_I d_E b_B iy_E ax_S s_B uw_I p_I er_E t_B iy_I m_E b_B ah_I t_E hh_B iy_E k_B ae_I n_I t_E d_B uw_E ih_B t_E b_B ay_E hh_B ih_I m_I s_I eh_I l_I f_E hh_B iy_I z_E g_B aa_I t_E t_B ax_E hh_B ae_I v_E ax_S l_B ay_I n_E ih_B n_E f_B r_I ah_I n_I t_E ah_B v_E hh_B ih_I m_E dh_B ae_I t_E n_B ow_I z_E hh_B aw_E t_B ax_E b_B l_I aa_I k_E sil diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2s.sh b/egs/swbd/s5c/local/chain/run_tdnn_2s.sh new file mode 100755 index 00000000000..6f7f9978ac6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2s.sh @@ -0,0 +1,260 @@ +#!/bin/bash + +# _2s is as _2o, but another topology, this time with 3 states and 3 pdf-ids +# worse :-( + +# WER on 2o 2s +# train_dev,tg 17.24 17.19 no diff +# train_dev,fg 15.93 15.97 no diff +# eval2000,tg 18.7 19.0 0.3 worse +# eval2000,fg 16.9 17.2 0.3 worse +# + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2s # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2t.sh b/egs/swbd/s5c/local/chain/run_tdnn_2t.sh new file mode 100755 index 00000000000..53a343d9f80 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2t.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# _2t is as _2o and _2s, but another topology: with 3 pdf-ids like 2s, but +# differently arranged. +# see table below, it's worse. + +#[ _2s is as _2o, but another topology, this time with 3 states and 3 pdf-ids +# worse :-(] + +# WER on 2o 2s 2t +# train_dev,tg 17.24 17.19 17.44 +# train_dev,fg 15.93 15.97 +# eval2000,tg 18.7 19.0 19.4 +# eval2000,fg 16.9 17.2 +# + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! + + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2t # Note: _sp will get added to this if $speed_perturb == true. 
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2u.sh b/egs/swbd/s5c/local/chain/run_tdnn_2u.sh new file mode 100755 index 00000000000..c05fb697d6f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2u.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +# _2u is as _2o, but using 'not-shared' in the roots files, to ensure that +# the initial and non-initial states will never be shared. I don't expect this +# to make any difference, as that question always gets asked, but it's a baseline for _2v. + + +# If anything, it's a little worse. 
+ +# WER on 2o 2u +# train_dev,tg 17.24 17.23 no diff +# train_dev,fg 15.93 15.98 no diff +# eval2000,tg 18.7 19.3 0.6% worse +# eval2000,fg 16.9 17.3 0.4% worse + + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2u # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + # use 'not-shared' roots so initial and non-initial pdf-ids cannot be the + # same. + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.txt + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.int +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2v.sh b/egs/swbd/s5c/local/chain/run_tdnn_2v.sh new file mode 100755 index 00000000000..3d279841190 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2v.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# _2v is as _2u, but adding the --scale-stats-for-pdf-classes="1=0.5" option to +# the tree building, to scale down the stats for the self-loop to have fewer pdf-ids +# assigned there and more to the initial state. + +# It's maybe a shade better than 2u, but certainly not better than 2o. I don't +# think I'll pursue this. Note: the code and the script option may not be +# checked in, and won't be checked in with this commit. + +# WER on 2o 2u 2v +# train_dev,tg 17.24 17.23 17.28 0.05% worse than 2u +# train_dev,fg 15.93 15.98 16.05 0.05% worse than 2u +# eval2000,tg 18.7 19.3 19.1 0.2% better than 2u +# eval2000,fg 16.9 17.3 17.1 0.2% better than 2u. + + +# _2u is as _2o, but using 'not-shared' in the roots files, to ensure that +# the initial and non-initial states will never be shared. I don't expect this +# to make any difference, as that question always gets asked, but it's a baseline for _2v. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2v # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + # use 'not-shared' roots so initial and non-initial pdf-ids cannot be the + # same. + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.txt + awk '{$1 = "not-shared"; print;}' $lang/phones/roots.int +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --scale-stats-for-pdf-classes "1=0.5" \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2w.sh b/egs/swbd/s5c/local/chain/run_tdnn_2w.sh new file mode 100755 index 00000000000..bcfc93aadb0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2w.sh @@ -0,0 +1,276 @@ +#!/bin/bash + +# _2w is as _2o, but setting the frame subsampling factor to 2 instead of 3. +# Going back to 100 frames per eg, which I previously found to be about the same in +# WER, because we were running out of memory [although this is before a code +# change to use reorder=false, which halved the num-states in the graph on this setup +# [~45k->22k], and reduced the num-transitions to a quarter [900k->225k]. + + +# a little surprisingly, it's worse, and clearly so. +# note, we can't really compare the objf values, as the chunk size is not the same. + +# WER on 2m 2o 2w +# train_dev,tg 17.22 17.24 17.62 +# train_dev,fg 15.87 15.93 16.49 +# eval2000,tg 18.7 18.7 19.4 +# eval2000,fg 17.0 16.9 17.8 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. 
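For context on the two options just mentioned: with deriv weights applied and a
nonzero --frames-overlap-per-eg (the setup described in the _p note further down),
each end of an eg gets roughly 10 zero-weight frames followed by a roughly 10-frame
linear ramp up to 1.0, and the _s -> _2e change turns this off entirely. A rough
illustration of that ramp shape only, not code from any Kaldi script:

  # per-frame derivative weight near the start of an eg (shape as described above)
  for t in $(seq 0 29); do
    awk -v t=$t 'BEGIN {
      if (t < 10)      w = 0.0;               # frames fully inside the overlap: no derivative
      else if (t < 20) w = (t - 9) / 10.0;    # linear ramp up to 1.0
      else             w = 1.0;               # normal frames: full weight
      printf("frame %2d  weight %.1f\n", t, w);
    }'
  done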
+ +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
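Relating the _2w note at the top of this file to the egs configuration: the number of
supervised output frames per eg is roughly frames_per_eg divided by the frame
subsampling factor, so 100 input frames at a factor of 2 and 150 input frames at a
factor of 3 both supervise about 50 outputs per eg. A quick sanity check (the
relationship is assumed here for illustration only):

  frame_subsampling_factor=2
  frames_per_eg=100
  echo "output frames per eg: $((frames_per_eg / frame_subsampling_factor))"   # -> 50
  echo "output frames per eg at factor 3, 150 frames: $((150 / 3))"            # -> 50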
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2w # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor $frame_subsampling_factor \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor $frame_subsampling_factor \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2x.sh b/egs/swbd/s5c/local/chain/run_tdnn_2x.sh new file mode 100755 index 00000000000..bff0983bd49 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2x.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# _2x is as _2w (which has frame subsampling factor of 2 not 3), but with more +# epochs (6 vs 4), as it looks like the 2w model hadn't completely trained. +# Re-using the egs. I added the results to the table below. The WER is +# even worse than 2x. + +# _2w is as _2o, but setting the frame subsampling factor to 2 instead of 3. +# Going back to 100 frames per eg, which I previously found to be about the same in +# WER, because we were running out of memory [although this is before a code +# change to use reorder=false, which halved the num-states in the graph on this setup +# [~45k->22k], and reduced the num-transitions to a quarter [900k->225k]. + + +# a little surprisingly, it's worse, and clearly so. +# note, we can't really compare the objf values, as the chunk size is not the same. + +# WER on 2m 2o 2w 2x +# train_dev,tg 17.22 17.24 17.62 17.79 +# train_dev,fg 15.87 15.93 16.49 16.57 +# eval2000,tg 18.7 18.7 19.4 19.6 +# eval2000,fg 17.0 16.9 17.8 18.0 + + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# WER on 2m 2o +# train_dev,tg 17.22 17.24 no diff +# train_dev,fg 15.87 15.93 no diff +# eval2000,tg 18.7 18.7 no diff +# eval2000,fg 17.0 16.9 0.1 better + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2x # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=6 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=100 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor $frame_subsampling_factor \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2w_sp/egs \ + --frame-subsampling-factor $frame_subsampling_factor \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2y.sh b/egs/swbd/s5c/local/chain/run_tdnn_2y.sh new file mode 100755 index 00000000000..6d61b7d860d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_2y.sh @@ -0,0 +1,267 @@ +#!/bin/bash + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. 
The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. 
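A small back-of-the-envelope check on the _2y change described at the top of this
file: with 150-frame egs, raising --frames-per-iter from 800k to 1.2 million raises
the number of egs processed per iteration from roughly 5,300 to 8,000, which is what
reduces the relative cost of the per-iteration overhead such as model averaging.
Illustration only, assuming frames-per-iter is counted in the same frames as
frames-per-eg (the exact bookkeeping inside the training script may differ):

  frames_per_iter=1200000
  frames_per_eg=150
  echo "approx egs per iteration: $((frames_per_iter / frames_per_eg))"   # -> 8000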
+ +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_2y # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3c.sh b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh new file mode 100755 index 00000000000..4f350891e8a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3c.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# _3c is as _2y, but using 'jesus' nonlinearity: the --jesus-dim 800 option, instead of +# --relu-dim 850. +# reusing the egs from 2y. +# caution: see config section, I changed some things while running. + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. 
+ +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
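On the max-param-change option referred to in the _h and _k notes: the idea is to
limit how large a parameter change any single minibatch can cause, roughly by scaling
down an update whose norm would exceed the limit. A conceptual sketch only, with a
made-up update norm; this is not how the training binaries are invoked:

  max_param_change=1.0
  delta_norm=2.5   # hypothetical norm of a proposed parameter update
  scale=$(awk -v n=$delta_norm -v m=$max_param_change \
    'BEGIN { s = (n > m) ? m / n : 1.0; printf "%.2f\n", s }')
  echo "update scaled by $scale"   # -> 0.40, so the applied change has norm 1.0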
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +# max_param_change=1.0 +max_param_change=0.5 # Changed it to this value on iteration 74. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 # switched to 64 on iteration 7 after a failure. +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --jesus-dim 800 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3d.sh b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh new file mode 100755 index 00000000000..ca8080db080 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3d.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# (note: cannot be reproduced using current scripts). +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# Results are about the same as 2y, or maybe just a little worse. 
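The splice-indexes string quoted above gives one set of frame offsets per TDNN layer;
the total acoustic context of the network is roughly the sum of the most negative
offsets plus the sum of the most positive offsets across layers. A small helper to
compute that for any such string (illustrative only, not part of the recipe):

  splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
  echo "$splice_indexes" | awk '{
    left = 0; right = 0;
    for (i = 1; i <= NF; i++) {            # one field per layer, e.g. "-6,-3,0,3"
      n = split($i, a, ",");
      lo = a[1] + 0; hi = lo;
      for (j = 2; j <= n; j++) { v = a[j] + 0; if (v < lo) lo = v; if (v > hi) hi = v; }
      left += lo; right += hi;
    }
    printf("total left context: %d frames, right context: %d frames\n", -left, right);
  }'   # -> 18 left, 13 right for the string above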
+ +# a03:s5c: ./show_wer.sh 3d +# %WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. 
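+
+# (An illustration of the state-pruning idea behind --num-extra-states /
+# --num-extra-lm-states described above, with made-up state names and log-like gains:
+# the un-pruned lower-order n-grams are always kept, and only the N higher-order states
+# with the largest estimated gain are admitted on top, i.e. a greedy top-N selection:)
+# printf "state_a 120.5\nstate_b 3.2\nstate_c 45.0\nstate_d 0.7\nstate_e 88.1\n" \
+#   | sort -k2,2gr | head -n 3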
+ +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
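+
+# (A numeric sketch of the egs edge handling described in the _p note above; note that
+# this particular run sets --apply-deriv-weights false, so these weights are not actually
+# applied here.  Assuming a 150-frame chunk, the per-frame derivative weights would be
+# zero for the outer 10 frames on each side and then ramp linearly up to 1.0 over the
+# next 10 frames:)
+# awk 'BEGIN { n=150;
+#   for (t=0; t<n; t++) {
+#     d = (t < n-1-t) ? t : n-1-t;              # distance from the nearer chunk edge
+#     w = (d < 10) ? 0.0 : (d < 20 ? (d-9)/10.0 : 1.0);
+#     printf("%d %.1f\n", t, w) } }'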
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3e.sh b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh new file mode 100755 index 00000000000..af5661b8c85 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3e.sh @@ -0,0 +1,275 @@ +#!/bin/bash + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. +# (note: cannot be reproduced using current scripts). + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
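+
+# (Rough arithmetic for the --num-jesus-blocks change described at the top of this file,
+# on the assumption that the blocks split the large Jesus-layer matrices into equal
+# block-diagonal pieces: an Nin x Nout matrix costs Nin*Nout multiplies per frame, but
+# with B blocks only Nin*Nout/B, so 200 blocks is roughly half the work of 100:)
+# awk 'BEGIN { nin=15000; nout=1800;
+#   for (b=100; b<=200; b*=2) printf("blocks=%d  multiplies/frame=%d\n", b, nin*nout/b) }'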
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3f.sh b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh new file mode 100755 index 00000000000..f33459f5f08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3f.sh @@ -0,0 +1,283 @@ +#!/bin/bash + + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# (note: cannot be reproduced using current scripts). +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. 
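+
+# (Back-of-the-envelope for the _2y frames-per-iter change above: the total number of
+# frames processed is fixed by the number of epochs, so the iteration count scales as
+# 1/frames-per-iter, and the fixed per-iteration overhead such as model averaging shrinks
+# by the same 1.5x factor:)
+# awk 'BEGIN { for (fpi=800000; fpi<=1200000; fpi+=400000)
+#   printf("frames_per_iter=%d  relative iteration count %.2f\n", fpi, 800000/fpi) }'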
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000 --num-jesus-blocks 200" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3g.sh b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh new file mode 100755 index 00000000000..ff1e539306f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3g.sh @@ -0,0 +1,303 @@ +#!/bin/bash + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# (note: cannot be reproduced using current scripts). +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3h.sh b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh new file mode 100755 index 00000000000..f0e9efc2ac4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3h.sh @@ -0,0 +1,289 @@ +#!/bin/bash + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. 
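+
+# (A small parse of that splice string, on the assumed reading that offsets after a ':'
+# are taps on the layer's own earlier output, i.e. left recurrence at -3 frames, while
+# the offsets before it are the usual feed-forward splicing; the recurrence is why the
+# decode stage below passes --extra-left-context:)
+# for layer in "-2,-1,0,1,2" "-1,2" "-3,0,3:-3" "-6,-3,0,3:-3" "-6,-3,0,3:-3"; do
+#   feed=${layer%%:*}; rec=${layer#*:}; [ "$rec" = "$layer" ] && rec="none"
+#   echo "feed-forward: $feed   recurrent: $rec"
+# done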
+ +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. 
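+
+# (Related to the _2c note above about checking that graphs are stochastic: once stage 13
+# below has built the graph, a check along these lines reports how far HCLG is from
+# stochastic; an optional sanity check, not something this recipe runs:)
+# fstisstochastic $dir/graph_sw1_tg/HCLG.fst || true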
+ +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
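+
+# (A sketch of how the schedule options above are typically consumed, on the assumption
+# that the effective learning rate decays geometrically from the initial to the final
+# value over training while the number of parallel jobs ramps from num_jobs_initial to
+# num_jobs_final; the 100-iteration count is made up just to show the shape:)
+# awk -v i=0.001 -v f=0.0001 -v j0=3 -v j1=16 -v iters=100 'BEGIN {
+#   for (t=0; t<=iters; t+=25) {
+#     lr = i * exp((t/iters) * log(f/i));
+#     nj = j0 + (j1-j0)*t/iters;
+#     printf("iter %3d  effective-lrate %.6f  num-jobs ~%d\n", t, lr, nj) } }'
+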
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3i.sh b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh new file mode 100755 index 00000000000..876048b5852 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3i.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. 
+# also a code fix (the recurrent connections weren't being used; bug in OptionalDescriptor) + +# Here is the original decoding, with frame-per-chunk=50 +#./show_wer.sh 3i +#%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# and a newer decoding with frames-per-chunk=100. +# ./show_wer.sh 3i +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# after initial decoding wasn't great, trying increasing frames-per-chunk from +# 50 to 100. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
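  # Editor's note (illustrative, not part of the original patch): the
  # create_split_dir.pl call above relies on ordinary bash brace expansion to
  # name one target directory per file system, which the script then uses to
  # spread the dumped egs over several disks (via links under $dir/egs/storage).
  # The expansion itself is easy to check:
  #   echo /export/b0{5,6,7,8}/$USER/kaldi-data
  #   # -> /export/b05/<user>/kaldi-data ... /export/b08/<user>/kaldi-data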
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 100 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3j.sh b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh new file mode 100755 index 00000000000..faef84e8879 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3j.sh @@ -0,0 +1,296 @@ +#!/bin/bash + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3j # Note: _sp will get added to this if $speed_perturb == true. 
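# Editor's note (illustrative sketch, assuming the usual structure of these
# recipes; the lines that actually do this are not visible in this hunk): the
# "_sp" suffix mentioned in the comment above is normally appended along these
# lines, with the speed-perturbed training set picked up at the same time:
#   suffix=
#   if [ "$speed_perturb" == "true" ]; then suffix=_sp; fi
#   dir=${dir}$suffix
#   train_set=train_nodup$suffix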
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh new file mode 100755 index 00000000000..b869c7b2553 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. 
BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k # Note: _sp will get added to this if $speed_perturb == true. 
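# Editor's note (illustrative, not part of the original patch): in the
# --splice-indexes string passed below, a trailing ":-3" on a layer marks the
# (left) recurrence described for _3f above, i.e. a connection back to that
# layer's output from 3 frames earlier, on top of the usual feed-forward
# offsets.  A small loop makes the per-layer setup explicit:
#   splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
#   layer=0
#   for spec in $splice_indexes; do
#     layer=$((layer+1))
#     case $spec in
#       *:*) echo "layer $layer: splices ${spec%%:*}, recurrence at offset ${spec#*:}";;
#       *)   echo "layer $layer: splices $spec, no recurrence";;
#     esac
#   done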
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh new file mode 100755 index 00000000000..7a016ed2197 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3k2.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# 3k2 is as 3k, but dumping the egs with --extra-left-context 20. 
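# Editor's note (illustrative, not part of the original patch): the recurrent
# connections would in principle like to see history beyond the fixed context
# that the feed-forward splices already require, which is what
# --extra-left-context supplies when the egs are dumped.  For reference, the
# feed-forward part of the splice setup used below works out to 18 frames of
# left context and 13 of right:
#   splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
#   left=0; right=0
#   for spec in $splice_indexes; do
#     ff=${spec%%:*}   # ignore the recurrent part after ':'
#     min=$(echo "$ff" | tr ',' '\n' | sort -n | head -n 1)
#     max=$(echo "$ff" | tr ',' '\n' | sort -n | tail -n 1)
#     [ $min -lt 0 ] && left=$((left - min))
#     [ $max -gt 0 ] && right=$((right + max))
#   done
#   echo "feed-forward left=$left right=$right"   # prints: feed-forward left=18 right=13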
+# Also there will have been some script changes in the meantime, +# e.g. possibly nonzero bias-mean; and reduced max-change on mix-up +# iters. + +# log-probs are better than 3k and in fact better than any experiment so far: +# valid -0.115->-0.107, and train -0.077 to -0.074. + +# Here is the WER using the default --frames-per-chunk of 50, and --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 20.45 [ 10060 / 49204, 988 ins, 3050 del, 6022 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 19.02 [ 9359 / 49204, 977 ins, 2877 del, 5505 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 22.3 | 4459 42989 | 79.9 12.8 7.3 2.3 22.3 60.2 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 20.4 | 4459 42989 | 81.5 11.1 7.4 1.9 20.4 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.filt.sys + +#... and here is the WER after changing it to 150, still with --extra-left-context 20: +#./show_wer.sh 3k2 +#%WER 18.91 [ 9306 / 49204, 1076 ins, 2517 del, 5713 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 17.43 [ 8574 / 49204, 958 ins, 2607 del, 5009 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.0 2.4 20.6 58.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 18.8 | 4459 42989 | 83.4 10.9 5.6 2.3 18.8 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is --frames-per-chunk 150, --extra-left-context 50 (changing the extra-left-context from 20 to 50 makes it worse): +#./show_wer.sh 3k2 +#%WER 19.46 [ 9574 / 49204, 1134 ins, 2635 del, 5805 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.87 [ 8792 / 49204, 880 ins, 3011 del, 4901 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 21.0 | 4459 42989 | 81.2 12.4 6.3 2.2 21.0 58.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 19.2 | 4459 42989 | 82.7 10.8 6.5 1.9 19.2 56.0 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --frames-per-chunk 150, --extra-left-context 50, --extra-left-context-initial 20. +#./show_wer.sh 3k2 +#%WER 19.10 [ 9400 / 49204, 1116 ins, 2498 del, 5786 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 17.54 [ 8628 / 49204, 884 ins, 2890 del, 4854 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 20.6 | 4459 42989 | 81.7 12.2 6.1 2.3 20.6 58.4 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 18.7 | 4459 42989 | 83.4 10.8 5.8 2.1 18.7 55.6 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# the following is with --extra-left-context-initial 20 --extra-left-context 50 --frames-per-chunk 100. +# I think what's happening is that it's figuring out when it's near the end of the chunk, and encouraging +# deletions at that point, for reasons that relate to edge effects in the objective function. 
+#./show_wer.sh 3k2 +#%WER 17.87 [ 8793 / 49204, 1061 ins, 2277 del, 5455 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.36 [ 8049 / 49204, 1033 ins, 2148 del, 4868 sub ] exp/chain/tdnn_3k2_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.8 11.8 5.5 2.5 19.7 57.8 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.4 10.3 5.2 2.2 17.8 54.7 | exp/chain/tdnn_3k2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3k is as _3i, but adding the option --jesus-stddev-scale 0.316 " +# [~sqrt(1/10)], which will make the jesus layer learn about 10 times faster- it +# was previously learning too slow, I think. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option. + +# # these results are with the non-optimal chunk size of 50 (in 3i, 100 was slightly better): +#%WER 17.86 [ 8787 / 49204, 1015 ins, 2366 del, 5406 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.52 [ 8130 / 49204, 1092 ins, 1969 del, 5069 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 19.6 | 4459 42989 | 82.5 11.4 6.0 2.2 19.6 57.5 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 2.0 17.8 55.1 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# The following are the corresponding results from 3i, decoded with the same chunk size. +##%WER 18.00 [ 8856 / 49204, 1025 ins, 2376 del, 5455 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_tg/wer_11_0.0 +##%WER 16.52 [ 8129 / 49204, 1084 ins, 1995 del, 5050 sub ] exp/chain/tdnn_3i_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +##%WER 19.8 | 4459 42989 | 82.6 11.9 5.5 2.4 19.8 57.7 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +##%WER 17.9 | 4459 42989 | 84.1 10.5 5.5 2.0 17.9 55.3 | exp/chain/tdnn_3i_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. 
+# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. 
We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3k2 # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
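  # Editor's note (illustrative, not part of the original patch): with
  # --frame-subsampling-factor 3, the chain output (and hence the tree and
  # alignments built here) is evaluated only once every 3 input frames, i.e.
  # every 30 ms at the usual 10 ms frame shift:
  #   awk 'BEGIN{printf "outputs/sec at 100 input frames/sec: %.1f\n", 100/3}'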
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --extra-left-context 20 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.316 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial 20 \ + --extra-left-context 50 \ + --frames-per-chunk 100 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3l.sh b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh new file mode 100755 index 00000000000..608e437659e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3l.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# [abandoned, not working well.] +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. 
I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. (see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. 
So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0. Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+ +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3m.sh b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh new file mode 100755 index 00000000000..b25f9f15130 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3m.sh @@ -0,0 +1,310 @@ +#!/bin/bash + +# [note: this uses BlockAffineComponent not RepeatedAffineComponent] +# _3m is as _3l, but changing --jesus-stddev-scale from 0.2 to 0.1, as the Jesus layers +# were learning too slowly in 3l (this will make them learn approximately 4x faster). +# [terminated, likelihoods were not promising]. + +# _3l is as _3j, but making similar changes to as 3i->3k, which is (1) adding +# the option --jesus-stddev-scale 0.2 [0.32 was not strong enough], and (2) a +# script change to give the recurrent affine layers an initial param-stddev of +# 0. I also changed the script +# make_jesus_configs_recurrent.py to give the recurrent affine layers an initial +# param-stddev of 0 which will discourage those corresponding input weights in +# the jesus layer from getting small in early iters; and removed the --normalize-target +# option and replaced it with the --final-layer-learning-rate-factor option; +# and added a learning-rate factor for + +# _3j is as _3i but using BlockAffineComponent instead of +# RepeatedAffineComponent in Jesus layers. 
(see --use-repeated-affine false +# option, which is newly added to the script). + +# _3i is as _3h but after a script fix in which the --final-layer-normalize-target is +# applied, in order to control how fast the final layer's affine component learns. + +# _3h is as _3g but using a different and hopefully better type of recurrence, using +# steps/nnet3/make_jesus_configs_recurrent.py to create the configs. This is more +# similar to LSTMs. +# We're re-using the egs from 2y, which isn't 100% ideal as we'd like some context. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worde. + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. 
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
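For readers who do not know the two component types compared in the _3j/_3l/_3m notes at the top of this file: as far as I understand nnet3, RepeatedAffineComponent repeats one shared affine block across all of the Jesus-layer blocks, while BlockAffineComponent gives every block its own parameters, so it has roughly num-blocks times as many weights for the same layout. The numpy sketch below (with made-up block sizes) only illustrates that structural difference; it is not how nnet3 actually stores these components:

import numpy as np

# Illustrative parameter layouts for a block-structured affine layer with
# num_blocks blocks, each mapping in_dim -> out_dim.  Block sizes are made up.
def repeated_affine_params(num_blocks, out_dim, in_dim, rng):
    shared = rng.standard_normal((out_dim, in_dim))   # one block, reused everywhere
    return [shared] * num_blocks

def block_affine_params(num_blocks, out_dim, in_dim, rng):
    return [rng.standard_normal((out_dim, in_dim)) for _ in range(num_blocks)]

rng = np.random.default_rng(0)
rep = repeated_affine_params(100, 18, 6, rng)
blk = block_affine_params(100, 18, 6, rng)
print("distinct parameters, repeated-affine:", rep[0].size)              # 108
print("distinct parameters, block-affine:   ", sum(b.size for b in blk)) # 10800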
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1500 --jesus-direct-recurrence-dim 1000 --jesus-projected-recurrence-output-dim 600 --jesus-projected-recurrence-input-dim 300 --jesus-hidden-dim 15000 --use-repeated-affine false --jesus-stddev-scale 0.1 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3n.sh b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh new file mode 100755 index 00000000000..dedbd84be75 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3n.sh @@ -0,0 +1,305 @@ +#!/bin/bash + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. 
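To spell out the notation used in the recurrent splice-indexes strings discussed above: each space-separated field corresponds to one layer, the comma-separated integers before the ':' are the spliced frame offsets, and in the recurrent setups the part after the ':' gives the offset(s) of the recurrent connection, e.g. "-3,0,3:-3" splices offsets -3,0,3 and adds a recurrent input from 3 frames back. This is just my reading of the comments; the sketch below parses the string under that interpretation and is not taken from make_jesus_configs_recurrent.py:

# Parse a --splice-indexes string such as
#   "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
# into per-layer (splice_offsets, recurrence_offsets) pairs.
def parse_splice_indexes(spec):
    layers = []
    for field in spec.split():
        if ":" in field:
            splice_part, recur_part = field.split(":", 1)
            recurrence = [int(x) for x in recur_part.split(",")]
        else:
            splice_part, recurrence = field, []
        splice = [int(x) for x in splice_part.split(",")]
        layers.append((splice, recurrence))
    return layers

spec = "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
for i, (splice, recurrence) in enumerate(parse_splice_indexes(spec)):
    print("layer %d: splice %s recurrence %s" % (i, splice, recurrence))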
+ +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
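A practical detail implied by these splicing setups: the acoustic context the network needs grows layer by layer, since the per-layer offsets compose. Assuming the contexts simply add up (which is how plain TDNN splicing behaves), the non-recurrent string above needs about 18 frames of left context and 13 of right; the recurrent variants need additional left context at decode time, which is presumably why the decode stages in these scripts pass --extra-left-context 20. A small sketch of that bookkeeping (my own helper, not one of the steps/ scripts):

# Total left/right context of a feed-forward splice setup, assuming per-layer
# contexts add up; recurrence offsets (after ':') are ignored here.
def total_context(splice_indexes):
    left = right = 0
    for field in splice_indexes.split():
        offsets = [int(x) for x in field.split(":", 1)[0].split(",")]
        left += -min(offsets)
        right += max(offsets)
    return left, right

print(total_context("-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"))  # (18, 13)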
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3o.sh b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh new file mode 100755 index 00000000000..14383fe1a32 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3o.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. +# [ seemed helpful based on likelihoods on first iterations]: on iter 42, +# train prob is -0.1554->-0.1523, and valid prob is -0.1559->-0.1540. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. 
+# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3p.sh b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh new file mode 100755 index 00000000000..ddba7e7f9c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3p.sh @@ -0,0 +1,333 @@ +#!/bin/bash + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# Comparing the WER with 2y, it's about 1% abs worse [see below]. However, this is +# for an odd reason: the model, while smaller than the 2y one (8.8 vs. 12.1 million +# parameters), seems to have a lot more learning capacity, with better train and worse valid +# prob. In 3r and 3s I am trying smaller versions of this architecture. 
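The "learning capacity" argument above boils down to the gap between the final train and valid probabilities: a model with fewer parameters but a larger train-minus-valid gap is fitting the training data harder and generalizing worse. A trivial helper to make that explicit, using placeholder numbers rather than the exact figures quoted below:

# Train/valid chain log-prob gap as a rough over-fitting indicator.
# The numbers below are placeholders, not results from these runs.
def train_valid_gap(train_logprob, valid_logprob):
    return train_logprob - valid_logprob   # larger gap => more over-fitting

print("model A gap:", round(train_valid_gap(-0.080, -0.120), 3))
print("model B gap:", round(train_valid_gap(-0.077, -0.127), 3))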
+ +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +# 2y 3p +# final-train-prob: -0.083068 -0.0771 +# final-valid-prob: -0.01212 -0.12715 +# num-parameters: 12094115 8804087 + + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, 
more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". 
+# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3p # Note: _sp will get added to this if $speed_perturb == true. 
+ +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3q.sh b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh new file mode 100755 index 00000000000..9f67164b806 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3q.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# _3q is as _3p, but now trying out the 'block' training script, where in addition to +# the affine connections we have block-matrix connections between the layers. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
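+
+# [Illustration only, not part of the original recipe: the %WER lines quoted in
+#  these headers are the best-scoring entries from the decode directories, and
+#  can be reproduced with the standard helpers, along the lines of
+#    grep Sum exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_*/eval2000_hires.ctm.filt.sys | utils/best_wer.sh
+#    grep WER exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh
+#  (show_wer.sh, used above, is presumably a local wrapper for queries of this kind).]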
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
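+
+# [Illustration only, not part of the original recipe: a rough sketch of the
+#  egs edge weighting described under _p above (this particular setup disables
+#  it again via --apply-deriv-weights false).  Ten frames at each edge get a
+#  derivative weight of zero, and the weight then ramps linearly up to 1.0 over
+#  the next ten frames; a toy loop that prints such a ramp:
+#    for t in $(seq 0 29); do
+#      if   [ $t -lt 10 ]; then w=0
+#      elif [ $t -lt 20 ]; then w=$(echo "scale=2; ($t - 9) / 10" | bc)
+#      else                     w=1.0
+#      fi
+#      echo "frame $t: deriv-weight $w"
+#    done ]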
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
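+
+  # (Illustrative note, not part of the original recipe: the brace pattern above
+  #  expands to one path per disk; something like
+  #    echo /export/b0{5,6,7,8}/$USER/kaldi-data/egs
+  #  prints the four /export/b05 ... /export/b08 variants, and
+  #  utils/create_split_dir.pl then spreads $dir/egs/storage across them so the
+  #  dumped egs do not pile up on a single filesystem.)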
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-block-opts "--jesus-full-output-dim 900 --jesus-full-input-dim 900 --jesus-block-input-dim 900 --jesus-block-output-dim 900 --jesus-hidden-dim 15000 --jesus-final-output-dim 600 --jesus-stddev-scale 0.4 --num-affine-blocks 25 --final-layer-target-rms 0.5" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,0,3 -6,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3r.sh b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh new file mode 100755 index 00000000000..7815adffb9f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3r.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] +# [I think I abandoned this after deciding to reduce the parameters even further, +# to the setup in 3s]. + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. 
+ +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
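+
+  # (Usage illustration, not part of the original recipe: the variables in the
+  #  configuration section above are exposed as command-line flags through
+  #  utils/parse_options.sh, so an interrupted run can be resumed, e.g.
+  #    local/chain/run_tdnn_3r.sh --stage 12 --train-stage 100
+  #  re-enters this block and has steps/nnet3/chain/train_tdnn.sh pick up again
+  #  from training iteration 100 rather than from the start.)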
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3s.sh b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh new file mode 100755 index 00000000000..6cee8b11925 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3s.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3t.sh b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh new file mode 100755 index 00000000000..25e30900e36 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3t.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +# _3t is as _3s but using slightly wider context. Dumping our own egs. +# The final train prob is better -0.0851->-0.0815, but valid prob is worse -0.1231->-0.1243. +# WER is slightly worse. So we won't use this for now, but later if we use more data we +# could try wider context like this. 
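+
+# [Illustration only, not part of the original recipe: the "wider context" shows
+#  up in the --splice-indexes strings.  Summing the extreme offsets per layer,
+#  3s ("-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3") sees roughly
+#  2+1+3+6+6 = 18 frames of left context and 2+2+3+3+3 = 13 frames of right
+#  context, while 3t ("-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6")
+#  sees roughly 20 frames on each side, which is why this script dumps its own
+#  egs instead of re-using the 2y egs.]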
+#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# +#%WER 18.01 [ 8860 / 49204, 1043 ins, 2315 del, 5502 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.68 [ 8205 / 49204, 930 ins, 2420 del, 4855 sub ] exp/chain/tdnn_3t_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.6 11.9 5.5 2.3 19.7 57.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.4 5.4 2.0 17.8 55.4 | exp/chain/tdnn_3t_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3u.sh b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh new file mode 100755 index 00000000000..d1b93d9084c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3u.sh @@ -0,0 +1,330 @@ +#!/bin/bash + +# _3u is as _3s (and re-using the egs) but with one more layer; keeping the same dim +# and total context, and reducing --jesus-forward-output-dim from 1500 to 1300 to +# ensure that the number of parameters doesn't increase too much. +# [stopping this run, as the likelihoods weren't promising, e.g. by iteration +# 39, the valid-prob was worse vs. 3t, -0.1488 -> -0.1521 (train: -0.1510 -> -0.1532) + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] 
+ +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. 
+ +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
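+
+  # Illustrative only (not part of the original run): the total acoustic context
+  # of the network is just the sum of the per-layer offsets in the
+  # --splice-indexes string used below, which here comes to 20 frames on each side.
+  splice_indexes="-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6"
+  left=0; right=0
+  for group in $splice_indexes; do
+    min=$(echo "$group" | tr ',' '\n' | sort -n | head -n1)
+    max=$(echo "$group" | tr ',' '\n' | sort -n | tail -n1)
+    left=$((left - min)); right=$((right + max))
+  done
+  echo "total model context from splice-indexes: $left frames left, $right frames right"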
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1300 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3v.sh b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh new file mode 100755 index 00000000000..c7fcb7e24f5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3v.sh @@ -0,0 +1,328 @@ +#!/bin/bash + +# _3v is as _3t but decreasing the --num-jesus-blocks from 100 to 50. +# I stopped it early after likelihoods were not promising: +# on iter 90, train prob was -0.1226->-0.1240, valid -0.1304->-0.1340. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. 
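+# (In the --splice-indexes notation each space-separated group is one layer's
+# set of spliced frame offsets, so "-1,2" feeds that layer only frames t-1 and
+# t+2 from the layer below, while "-1,0,1,2" also gives it t and t+1.)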
+ +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. 
+# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. 
I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
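+
+  # Note (illustrative): --num-jesus-blocks, lowered to 50 for this run, is the
+  # number of block-diagonal groups the Jesus-layer matrices are split into, so
+  # the per-block dims are the forward/hidden dims divided by the block count.
+  # Assuming those dims have to stay divisible by it, a quick sanity check for
+  # the values used below:
+  num_jesus_blocks=50
+  for d in 400 1500 15000; do
+    [ $((d % num_jesus_blocks)) -eq 0 ] || echo "warning: dim $d is not a multiple of $num_jesus_blocks"
+  done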
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --num-jesus-blocks 50 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3w.sh b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh new file mode 100755 index 00000000000..e4165e54de6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3w.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +# _3w is as _3t but instead of having a rectangular affine component in each +# layer, making it square (700->600 not 1300->400), and introducing a new script +# option --final-hidden-dim to have something like a bottleneck at the last +# layer, to avoid a blowup in parameters. +# (note: num-params was slightly smaller, 4.8 million vs 5.3 +# I stopped this on iter 65 after likelihoods were not promising: +# on iter 63, train -0.133->-0.138, valid -0.138->-0.141. + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. 
+# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. 
+# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. 
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
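+
+  # Note (illustrative): re-using the 3t egs via --egs-dir below only works
+  # because this model needs no more left/right context than those egs were
+  # dumped with; assuming the standard egs info files are present, the dumped
+  # context can be printed like this:
+  if [ -f exp/chain/tdnn_3t_sp/egs/info/left_context ]; then
+    echo "re-used egs: left context $(cat exp/chain/tdnn_3t_sp/egs/info/left_context)," \
+      "right context $(cat exp/chain/tdnn_3t_sp/egs/info/right_context)"
+  fi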
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 800 --final-hidden-dim 400 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -3,-2,-1,0,1,2,3 -3,0,3 -6,-3,0,3,6 -6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3x.sh b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh new file mode 100755 index 00000000000..1585d209a93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3x.sh @@ -0,0 +1,341 @@ +#!/bin/bash + +# _3x is as _3s (and continuing the same kind of experimentation as in 3t->3w)... +# increasing --jesus-forward-output-dim from 1500 to 2000. +# More overtraining: final-train -0.0852->-0.0799, final-valid -0.1231->-0.1261, +# WER effect is very tiny but maybe slightly better. 
+#a03:s5c: ./show_wer.sh 3x +#%WER 17.78 [ 8750 / 49204, 910 ins, 2405 del, 5435 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_tg/wer_12_0.0 +#%WER 16.60 [ 8166 / 49204, 921 ins, 2290 del, 4955 sub ] exp/chain/tdnn_3x_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.5 | 4459 42989 | 82.7 11.4 5.9 2.2 19.5 57.5 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.7 | 4459 42989 | 84.3 10.3 5.5 1.9 17.7 54.6 | exp/chain/tdnn_3x_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 3s +#%WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + + +# _3t is as _3s but using slightly wider context. Dumping our own egs. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
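+
+  # Note (illustrative): the train/valid "prob" figures quoted in the header
+  # comparisons come from the per-iteration diagnostic jobs; assuming the usual
+  # nnet3 log layout, the final numbers can be pulled out with something like
+  # the following (left commented out, since these logs only exist once training
+  # has finished):
+  #   grep Overall $dir/log/compute_prob_train.final.log
+  #   grep Overall $dir/log/compute_prob_valid.final.log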
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_3t_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 2000 --final-hidden-dim 350 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3y.sh b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh new file mode 100755 index 00000000000..042ec84898b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3y.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _3y is as _3s but doubling jesus-hidden-dim from 15000 to 30000. +# not promising: by iteration 228, train prob changed -0.09583->-0.09575, and +# valid prob from -0.1213 -> -0.1239. Killed it. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 3s. 
+ +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# it is 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building; I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
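+
+# To make the _p note above concrete: with a 30-frame overlap, each edge of an
+# eg gets 10 frames of zero derivative weight followed by a ramp up to 1.0 over
+# the next 10 frames.  Assuming a linear ramp, purely for illustration (the
+# exact shape is whatever the egs-dumping code writes):
+#   awk 'BEGIN{for(i=0;i<30;i++){w=(i<10)?0:(i<20)?(i-9)/10.0:1; printf("frame %2d weight %.1f\n",i,w)}}'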
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
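+
+  # The storage links above just spread the (large) egs over several
+  # /export/b0{5,6,7,8} disks on the CLSP grid; elsewhere the block above is a
+  # no-op and the egs are written directly under $dir/egs.  An optional sanity
+  # check, assuming create_split_dir.pl made numbered symlinks under the
+  # storage directory:
+  #   ls -l $dir/egs/storage | head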
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 30000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_3z.sh b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh new file mode 100755 index 00000000000..f1fa2c5a45e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_3z.sh @@ -0,0 +1,350 @@ +#!/bin/bash + +# _3z is as _3s, but reducing the target num-states in the tree building from 9k to 6k. +# A slight degradation in WER, but it's not 100% consistent. The final train-prob +# was worse -0.0852 -> -0.0888, and valid-prob was worse -0.1231->-0.1280. +#./show_wer.sh 3z +#%WER 18.05 [ 8883 / 49204, 990 ins, 2397 del, 5496 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.50 [ 8120 / 49204, 960 ins, 2234 del, 4926 sub ] exp/chain/tdnn_3z_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 19.7 | 4459 42989 | 82.5 11.9 5.5 2.2 19.7 57.6 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.1 10.4 5.5 1.9 17.8 55.1 | exp/chain/tdnn_3z_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
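+
+# Notation reminder for --splice-indexes, as used in the _3f/_3g notes above:
+# each space-separated group configures one layer, and the comma-separated
+# numbers are the frame offsets spliced together at that layer.  A trailing
+# ":-3", as in
+#   --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3"
+# adds a recurrent (left-context) connection at that layer, which is what turns
+# the plain TDNN into the TDNN/RNN hybrid described in _3f.  (This is a reading
+# of the notes above, not a separate specification of the config script.)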
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
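+
+# Re the _2c note above about checking that graphs are stochastic: once stage 13
+# below has built the graph, a quick check is
+#   fstisstochastic $dir/graph_sw1_tg/HCLG.fst
+# which prints the min/max deviation from stochasticity (ideally both close to
+# zero).  Just an aside; nothing in this script depends on it.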
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_3z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 6000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
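+
+  # Optional: once stage 11 above has built the tree, the actual number of
+  # leaves can be compared against the 6000 requested there; tree-info prints
+  # num-pdfs among other fields:
+  #   tree-info $treedir/tree | grep num-pdfs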
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-2,-1,0,1,2 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4a.sh b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh new file mode 100755 index 00000000000..c02ad2cb0e4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4a.sh @@ -0,0 +1,349 @@ +#!/bin/bash + +# _4a is as _3s, but using narrower splice-indexes in the first layer. +# WER is maybe a fraction worse than 3s (see below); final train prob is +# worse -0->0852 -> -0.0879, and valid prob is better -0.121 ->-0.1213 +#./show_wer.sh 4a +#%WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +#%WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +#%WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
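+
+# The train-prob / valid-prob numbers quoted in these notes (e.g. in the _4a
+# header and in the 2o/2y table above) can be read back out of the training
+# logs.  A sketch, assuming the usual log naming of these train_tdnn.sh runs
+# and that the last compute_prob_{train,valid} logs correspond to the final
+# model:
+#   for d in exp/chain/tdnn_3s_sp exp/chain/tdnn_4a_sp; do
+#     for t in train valid; do
+#       last=$(ls $d/log/compute_prob_${t}.*.log 2>/dev/null | sort -V | tail -n1)
+#       [ -n "$last" ] && echo -n "$d $t: " && grep -h 'Overall log-prob' $last | tail -n1
+#     done
+#   done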
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
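+
+  # Reminder (see the _4a note at the top of this file): relative to 3s the
+  # change here is the first --splice-indexes group in the command below,
+  # "-1,0,1" instead of the "-2,-1,0,1,2" used by 3s/3z, i.e. a narrower input
+  # context at the first layer.  The 2y egs are re-used via --egs-dir.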
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4b.sh b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh new file mode 100755 index 00000000000..aad278c3037 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4b.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4b is as _4a, but even narrower splice-indexes in 1st layer (no splicing) +# stopped early after train and valid likelihoods were not promising. +# [later accidentally overwrote and moved the dir.] + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
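+
+  # Reminder (see the _4b note at the top of this file): the first
+  # --splice-indexes group in the command below is just "0", i.e. no temporal
+  # splicing at all at the first layer; this run was stopped early because the
+  # train and valid likelihoods were not promising.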
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "0 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4c.sh b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh new file mode 100755 index 00000000000..d9060251844 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4c.sh @@ -0,0 +1,357 @@ +#!/bin/bash + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
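+
+# The parameter counts quoted above ("5.4 million, vs. 12.1 million in 2y, and
+# 8.8 million in 3p") can be re-checked from the trained models.  A sketch,
+# assuming the final.mdl files are still around and that nnet3-am-info reports
+# a num-parameters field:
+#   for s in 3s 3p 2y; do
+#     echo -n "tdnn_${s}_sp: "
+#     nnet3-am-info exp/chain/tdnn_${s}_sp/final.mdl 2>/dev/null | grep num-parameters
+#   done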
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
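+
+  # Note: the --egs-dir option below re-uses the examples already dumped by the
+  # tdnn_2y run (see the _3d note above); this assumes the egs-related settings
+  # (frames-per-eg, contexts, etc.) still match that run.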
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4d.sh b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh new file mode 100755 index 00000000000..1ae220dc21a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4d.sh @@ -0,0 +1,346 @@ +#!/bin/bash + +# _4d is as _4a, but with --egs-opts "--frames-overlap-per-eg 10 +# --cut-zero-frames 5" and changing apply-deriv-weights to true... this to +# activate the new-style derivative weights. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the option --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames (see the sketch below).
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
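+
+# (Sketch of the deriv-weight profile described in the _p note above, for a
+# 150-frame eg with --frames-overlap-per-eg 30; this only illustrates the text,
+# it is not taken from the actual code:)
+#   seq 0 149 | awk '{t=$1; if (t<10||t>=140) w=0; else if (t<20) w=(t-9)/10; else if (t>=130) w=(140-t)/10; else w=1; print t, w}'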
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 15000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \
+    --apply-deriv-weights true \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 10 --cut-zero-frames 5" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4e.sh b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
new file mode 100755
index 00000000000..fea5495ee06
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_4e.sh
@@ -0,0 +1,362 @@
+#!/bin/bash
+
+# _4e is as _4c, but adding the option --l2-regularize 0.0001.
+# A big improvement: about 0.7% WER abs.  Considering the non-l2 part of the objf, the
+# final valid objf c->e is -0.1241->-0.1266 [and the l2 term is -0.0196],
+# and for the training set it's -0.08820 -> -0.1149.
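+
+# (Reading the numbers above at face value: if the full objf is the non-l2 part
+# plus the l2 term, the combined final valid objf for 4e would be roughly
+# -0.1266 + (-0.0196) = -0.1462, i.e. worse than 4c's -0.1241 on the combined
+# objective even though WER improves, which is why the comparison above is done
+# on the non-l2 part only.)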
+ + +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4c +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. 
+# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. 
It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.0001 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4f.sh b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh new file mode 100755 index 00000000000..36d5f188c56 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4f.sh @@ -0,0 +1,366 @@ +#!/bin/bash + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4g.sh b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh new file mode 100755 index 00000000000..430c6c28c70 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4g.sh @@ -0,0 +1,365 @@ +#!/bin/bash + +# _4g is as _4c, but reducing the --jesus-hidden-dim further from 7500 to 4000. +# Strangely, the trend from 4a->4c does not continue: instead of continuing to get worse, +# the train and valid probs both get better. + +# 4a 4c 4g +# Final train prob: -0.0879 -0.08820 -0.08784 +# Final valid prob: -0.1214 -0.1241 -0.1204 + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. +# Yay-- WER is slightly better or the same. Final train-prob is worse +# -0.0879 -> -0.0882, and valid-prob worse -0.1213 -> -0.1241. 
+ +# %WER 17.63 [ 8673 / 49204, 956 ins, 2334 del, 5383 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.61 [ 8175 / 49204, 964 ins, 2272 del, 4939 sub ] exp/chain/tdnn_4c_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.7 | 4459 42989 | 82.6 11.8 5.6 2.3 19.7 57.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.6 5.2 2.0 17.8 54.4 | exp/chain/tdnn_4c_sp/decode_eval2000_sw1_fsh_fg/score_10_1.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4a +# %WER 17.88 [ 8800 / 49204, 1017 ins, 2233 del, 5550 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.73 [ 8231 / 49204, 898 ins, 2397 del, 4936 sub ] exp/chain/tdnn_4a_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.7 | 4459 42989 | 82.5 12.0 5.5 2.3 19.7 57.6 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_tg/score_10_0.5/eval2000_hires.ctm.filt.sys +# %WER 17.8 | 4459 42989 | 84.2 10.3 5.5 2.0 17.8 55.1 | exp/chain/tdnn_4a_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys + + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] 
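+# [Not from the original log: the parameter counts quoted in the _3s note above
+# (5.4 million / 8.8 million / 12.1 million) could be re-checked from the trained
+# models, assuming they are still on disk; something along these lines should print
+# them (the tool and output field name are from memory, so treat this as a sketch):
+#   for d in tdnn_2y_sp tdnn_3p_sp tdnn_3s_sp; do
+#     nnet3-am-info exp/chain/$d/final.mdl | grep num-parameters
+#   done
+# ]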
+ + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. 
+ +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. 
Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
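+  # The --egs-dir option below points at the egs dumped for the earlier 2y run,
+  # so this experiment re-uses those examples instead of generating new ones.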
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 4000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4n.sh b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh new file mode 100644 index 00000000000..9125d4e7967 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4n.sh @@ -0,0 +1,386 @@ +#!/bin/bash + +# _4n is as _4f, but adding the [new] option --convert-repeated-to-block-iter=100. +# reusing iter 100 of model 4f to avoid some iterations of training [did this by +# doing (cd exp/chain; cp -r tdnn_4f_sp tdnn_4n_sp), and then running this script with +# --iter 100]. +# [note: to get the block-affine stuff to train fast enough to make a difference +# I multiplied a factor of sqrt(num-blocks) into the learning-rate factor in +# the code. That change is not committed.] +# +# Essentially no effect on WER, but train and valid probs are worse. +# ./compare_wer.sh 4f 4n +# System 4f 4n +# WER on train_dev(tg) 16.83 16.84 +# WER on train_dev(fg) 15.73 15.69 +# WER on eval2000(tg) 18.4 18.4 +# WER on eval2000(fg) 16.6 16.6 +# Final train prob -0.105832 -0.111309 +# Final valid prob -0.123021 -0.123601 + + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
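+# [Illustrative aside, not from the original notes: one way to spot-check the claim in
+# the _2c note above that the graphs are stochastic is Kaldi's fstisstochastic tool,
+# e.g. something like
+#   fstisstochastic exp/chain/tdnn_4n_sp/graph_sw1_tg/HCLG.fst
+# which reports how far the FST's arc weights deviate from summing to one; values near
+# zero mean the graph is (close to) stochastic.]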
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --convert-repeated-to-block-iter 100 \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4p.sh b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh new file mode 100755 index 00000000000..d2b073cdc77 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4p.sh @@ -0,0 +1,381 @@ +#!/bin/bash + +# _4p is as _4f, but one fewer layer, and making the final-layer context wider to +# compensate; also increasing the jesus-layer input and output dims 400->500 and 1500->1600 to +# somewhat compensate for the reduction in parameters. + +# definitely worse. Later with 4r I go in the opposite direction by adding a new layer, +# and get a small improvement. +# ./compare_wer.sh 4f 4p +# System 4f 4p +# WER on train_dev(tg) 16.83 17.36 +# WER on train_dev(fg) 15.73 16.10 +# WER on eval2000(tg) 18.4 19.1 +# WER on eval2000(fg) 16.6 17.2 +# Final train prob -0.105832 -0.104439 +# Final valid prob -0.123021 -0.125576 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
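+# [Illustrative aside, not from the original notes: with TDNN splicing, the total
+# acoustic context of the network is roughly the sum over layers of the most negative
+# and most positive splice offsets.  For the splice-indexes used further below in this
+# script, "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6", that works out to about
+# 1+1+6+9 = 17 frames of left context and 1+2+3+6 = 12 frames of right context,
+# before frame subsampling is taken into account.]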
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 450 --jesus-forward-output-dim 1600 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -6,-3,0,3 -9,-6,-3,0,3,6" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4q.sh b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh new file mode 100755 index 00000000000..9f2534f4f22 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4q.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is based on Dan's tdnn_2o script +# it has a different splicing configuration +# it uses the PerDimensionWeightedAverage pooling in place of the Jesus layer + +set -e + +#%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +#%WER 15.59 [ 7671 / 49204, 883 ins, 2234 del, 4554 sub ] exp/chain/tdnn_v1_trial6_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 + + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4q # Note: _sp will get added to this if $speed_perturb == true. 
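+# decode_iter, if set to a training iteration number, makes the decoding stage below
+# use that iteration's model (it is passed to the decode script as --iter) instead of
+# the final model.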
+decode_iter= + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window=7 +pool_type='per-dim-weighted-average' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4r.sh b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh new file mode 100755 index 00000000000..64831b5802a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4r.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
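[Annotation, not part of the patch: the --splice-indexes strings discussed above give one group of frame offsets per layer, and the network's total temporal context is roughly the sum over layers of the most negative / most positive offset in each group. A small self-contained sketch of that bookkeeping:]

splice_indexes="-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3"
left=0; right=0
for group in $splice_indexes; do
  group=${group%%:*}    # drop any recurrence part such as ":-3"
  min=$(echo "$group" | tr ',' '\n' | sort -n | head -n1)
  max=$(echo "$group" | tr ',' '\n' | sort -n | tail -n1)
  left=$((left - min)); right=$((right + max))
done
echo "total model context: $left frames to the left, $right to the right"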
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
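[Annotation, not part of the patch: the %WER figures quoted in these header comments are collected from the decode directories by small helpers such as show_wer.sh / compare_wer.sh, which are not included here; a rough equivalent using utils/best_wer.sh from the standard recipe would be:]

for d in exp/chain/tdnn_4f_sp/decode_*; do
  [ -d "$d" ] || continue
  grep WER "$d"/wer_* 2>/dev/null | utils/best_wer.sh                    # train_dev-style scoring
  grep Sum "$d"/score_*/*.ctm.filt.sys 2>/dev/null | utils/best_wer.sh   # eval2000 sclite scoring
done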
+
+
+# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
+# option and setting max-param-change to 1.0.  Using the same egs.
+
+# _i is as _h but longer egs: 150 frames instead of 75, and
+# 128 elements per minibatch instead of 256.
+
+# _h is as _g but different application of max-param-change (use --scale-max-param-change true)
+
+# _g is as _f but more splicing at last layer.
+
+# _f is as _e but with 30 as the number of left phone classes instead
+# of 10.
+
+# _e is as _d but making it more similar in configuration to _b.
+# (turns out b was better than a after all-- the egs' likelihoods had to
+# be corrected before comparing them).
+# the changes (vs. d) are: change num-pdfs target from 8k to 12k,
+# multiply learning rates by 5, and set final-layer-normalize-target to 0.5.
+
+# _d is as _c but with a modified topology (with 4 distinct states per phone
+# instead of 2), and a slightly larger num-states (8000) to compensate for the
+# different topology, which has more states.
+
+# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0
+# as the default) as it's not clear that it was helpful; using the old learning-rates;
+# and modifying the target-num-states to 7000.
+
+# _b is as _a except for configuration changes: using 12k num-leaves instead of
+# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5,
+# which will make the final layer learn less fast compared with other layers.
+
+set -e
+
+# configs for 'chain'
+stage=12
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+dir=exp/chain/tdnn_4r  # Note: _sp will get added to this if $speed_perturb == true.
+
+# training options
+num_epochs=4
+initial_effective_lrate=0.001
+final_effective_lrate=0.0001
+leftmost_questions_truncate=-1
+max_param_change=1.0
+final_layer_normalize_target=0.5
+num_jobs_initial=3
+num_jobs_final=16
+minibatch_size=128
+frames_per_eg=150
+remove_egs=false
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+  # Build a tree using our new topology.
+  steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+      --leftmost-questions-truncate $leftmost_questions_truncate \
+      --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir
+fi
+
+if [ $stage -le 12 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
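  # [Annotation, not part of the patch.] utils/create_split_dir.pl above turns
  # $dir/egs/storage into a set of symlinked subdirectories spread over the listed
  # /export/b0{5,6,7,8} disks, so the large egs archives are distributed rather than
  # written to a single filesystem; the .nodelete marker is, as far as I understand
  # the convention, checked by cleanup scripts so that egs re-used by other runs
  # are not deleted.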
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4s.sh b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh new file mode 100755 index 00000000000..92a1a7da277 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4s.sh @@ -0,0 +1,380 @@ +#!/bin/bash + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option- +#currently in a branch] +# Overall no real change. + +# ./compare_wer.sh 4f 4s +# System 4f 4s +# WER on train_dev(tg) 16.83 16.82 +# WER on train_dev(fg) 15.73 15.62 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.111371 +# Final valid prob -0.123021 -0.12648 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
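[Annotation, not part of the patch: the --lm-opts values discussed above are forwarded to the phone language-model estimation inside the chain training script; schematically the denominator LM is built from the phone sequences of the training alignments, roughly as below. The paths and the exact pipeline are assumptions; only the --num-extra-lm-states option itself comes from the script.]

ali-to-phones $ali_dir/final.mdl "ark:gunzip -c $ali_dir/ali.*.gz|" ark:- | \
  chain-est-phone-lm --num-extra-lm-states=2000 ark:- $dir/phone_lm.fst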
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.02 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4t.sh b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh new file mode 100755 index 00000000000..30b383d05d7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4t.sh @@ -0,0 +1,382 @@ +#!/bin/bash + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# [note, I accidentally overwrote this directory afterwards, and moved it.] +# It's really not clear whether it's helpful. +# ./compare_wer.sh 4f 4t +# System 4f 4t +# WER on train_dev(tg) 16.83 16.75 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.5 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.112721 +# Final valid prob -0.123021 -0.129688 + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4u.sh b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh new file mode 100755 index 00000000000..ae7cf02b426 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4u.sh @@ -0,0 +1,384 @@ +#!/bin/bash + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +# It seems a bit better on average. +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. + +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
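+
+# (Side note, a sketch grounded in the script body further down rather than in
+# the '5o' setup: in these run_tdnn_*.sh scripts the tree-building is guarded by
+#   if [ $stage -le 11 ]; then ... steps/nnet3/chain/build_tree.sh ... ; fi
+# so re-using an already-built tree just means leaving --stage at its default
+# of 12 (or passing --stage 12 on the command line), with $treedir pointing at
+# the existing tree directory.)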
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.08 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4v.sh b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh new file mode 100755 index 00000000000..9cdbfefb5a2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4v.sh @@ -0,0 +1,394 @@ +#!/bin/bash + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +#./compare_wer.sh 4r 4v +#System 4r 4v +#WER on train_dev(tg) 16.50 15.95 +#WER on train_dev(fg) 15.45 14.69 +#WER on eval2000(tg) 18.3 17.7 +#WER on eval2000(fg) 16.7 16.0 +#Final train prob -0.103652 -0.106646 -1.60775 +#Final valid prob -0.121105 -0.118631 -1.62832 + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
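+
+# (For reference, the script below keeps this setting: the config section sets
+# leftmost_questions_truncate=-1 and the tree-building stage passes it through
+# unchanged, i.e. effectively
+#   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
+#     --leftmost-questions-truncate -1 --cmd "$train_cmd" 9000 ...
+# so nothing extra is needed to keep the mechanism disabled.)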
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  The only option we supply is --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively treats the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames (sketched just after the _m note below).
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
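+
+# (A rough illustration of the _p edge handling above: with the frames-overlap
+# of 30, the derivative weights at each end of an eg would look something like
+#   0 0 0 0 0 0 0 0 0 0   0.1 0.2 ... 0.9 1.0   1.0 1.0 ...
+# i.e. 10 zero-weight frames, a ramp up to 1.0 over the next 10 frames, then
+# full weight for the interior; the exact shape of the ramp is an assumption
+# here, the weights are simply dumped together with the egs.)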
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4w.sh b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh new file mode 100755 index 00000000000..6dd5c587f7a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4w.sh @@ -0,0 +1,397 @@ +#!/bin/bash + +# _4w is as _4v, but doubling --xent-regularize to 0.2 WER seems consistently a +# bit worse, although final valid prob is very slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_4x.sh b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh new file mode 100755 index 00000000000..0290e0bdbd5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_4x.sh @@ -0,0 +1,396 @@ +#!/bin/bash + +# _4x is as _4u, but with --leaky-hmm-coefficient 0.2. Note: the +# ultimate baseline is 4f. It seems a little bit worse than 4u on average: (+0.2, +0.2, 0.0, -0.1). +# So I'm guessing the best value is around --leaky-hmm-coefficient 0.1. +# +# ./compare_wer.sh 4f 4u 4x +# System 4f 4u 4x +# WER on train_dev(tg) 16.83 16.47 16.63 +# WER on train_dev(fg) 15.73 15.23 15.42 +# WER on eval2000(tg) 18.4 18.4 18.4 +# WER on eval2000(fg) 16.6 16.7 16.6 +# Final train prob -0.105832 -0.118911 -0.130674 +# Final valid prob -0.123021 -0.135768 -0.146351 + +# _4u is as _4t, but with --leaky-hmm-coefficient 0.08. Note: the +# ultimate baseline is 4f. + +#./compare_wer.sh 4f 4u +#System 4f 4u +#WER on train_dev(tg) 16.83 16.47 +#WER on train_dev(fg) 15.73 15.23 +#WER on eval2000(tg) 18.4 18.4 +#WER on eval2000(fg) 16.6 16.7 +#Final train prob -0.105832 -0.118911 +#Final valid prob -0.123021 -0.135768 + +# _4t is as _4s, but with --leaky-hmm-coefficient 0.04. 
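+
+# (A minimal sketch, not one of the runs recorded here: 4s/4t/4u/4x together
+# form a sweep over --leaky-hmm-coefficient in {0.02, 0.04, 0.08, 0.2}.
+# Assuming the 4s and 4u scripts exist in this patch alongside 4t and 4x, and
+# that each hard-codes its own coefficient and output directory as these do,
+# the sweep amounts to running them in turn, e.g.
+#   for v in s t u x; do local/chain/run_tdnn_4${v}.sh; done
+# and then comparing with ./compare_wer.sh as above.)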
+ +# _4s is as _4f, but with --leaky-hmm-coefficient 0.02. [A new option.] + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_4x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --leaky-hmm-coefficient 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5a.sh b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh new file mode 100755 index 00000000000..cd1de07a80d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5a.sh @@ -0,0 +1,401 @@ +#!/bin/bash + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. Very helpful (between 0.2% +# and 0.6%). + +#./compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
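A note on the --xent-regularize values being swept in the _4v/_4w experiments above
(0.1 vs. 0.2): as I understand the chain setup, the network gets a second output layer
trained with cross-entropy, and the objective is roughly

  total_objf = chain_objf + xent_regularize * xent_objf

so these runs are only varying the weight on that auxiliary cross-entropy term; the
xent output acts as a regularizer during training and is not used at decode time.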
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
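The learning-rate options in the config section further down (initial_effective_lrate,
final_effective_lrate, num_jobs_initial/final) are coupled: the 'effective' rate is the
rate after averaging the parallel jobs' models, so the per-job rate scales with the
current number of jobs, and the rate decays from the initial to the final value over
training. A rough sketch of what that schedule looks like, assuming exponential
interpolation and a linear ramp of the job count (both assumptions here; the exact
schedule is whatever steps/nnet3/chain/train_tdnn.sh implements, and niter below is
just an illustrative iteration count):

  awk -v lr0=0.001 -v lr1=0.0001 -v j0=3 -v j1=16 -v niter=100 'BEGIN {
    for (i = 0; i <= niter; i += 20) {
      frac = i / niter
      eff  = lr0 * exp(frac * log(lr1 / lr0))    # exponential interpolation
      jobs = int(j0 + frac * (j1 - j0) + 0.5)    # job count ramps up during training
      printf("iter %3d  num-jobs %2d  effective-lrate %.6f  per-job lrate %.6f\n",
             i, jobs, eff, eff * jobs)
    }
  }'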
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5b.sh b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh new file mode 100755 index 00000000000..7e44c10920e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5b.sh @@ -0,0 +1,404 @@ +#!/bin/bash + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
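On the --leaky-hmm-coefficient option introduced at the top of this file (0.1 here; the
earlier tdnn_4x run used 0.2): as I understand it, during the denominator
forward-backward a small amount of probability mass is allowed to leak between HMM
states in proportion to their initial probabilities, something like

  alpha'(s) = alpha(s) + leaky_hmm_coefficient * init_prob(s) * sum_s' alpha(s')

which smooths the denominator computation; per the comparison table above it buys
between 0 and 0.35% absolute.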
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5c.sh b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh new file mode 100755 index 00000000000..93ebb59b16d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5c.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# _5c is as _4w, but changing --xent-regularize to 0.05, since 0.2 seemed to be +# worse than 0.1. +# It seems a little worse on average: WER change is (+0.3, +0.3, -0.2, +0.2). +#System 4w 5c +#WER on train_dev(tg) 16.05 16.35 +#WER on train_dev(fg) 14.92 15.21 +#WER on eval2000(tg) 18.0 17.8 +#WER on eval2000(fg) 16.2 16.4 +#Final train prob -0.108816 -0.107098 +#Final valid prob -0.118254 -0.118209 + +# _4w is as _4v, but doubling --xent-regularize to 0.2. WER seems consistently +# a bit worse (+0.1, +0.2, +0.3, +0.2), although final valid prob is very +# slightly better. + +#./compare_wer.sh 4v 4w +#System 4v 4w +#WER on train_dev(tg) 15.95 16.05 +#WER on train_dev(fg) 14.69 14.92 +#WER on eval2000(tg) 17.7 18.0 +#WER on eval2000(fg) 16.0 16.2 +#Final train prob -0.106646 -0.108816 +#Final valid prob -0.118631 -0.118254 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
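For reading the '%WER ... | 4459 42989 | ...' lines quoted throughout these notes: they
are NIST sclite summary (.sys) lines, the two counts are #segments and #words, and the
six numbers in the middle block are %Corr %Sub %Del %Ins %WER %SER (sentence error). A
tiny helper to pull one apart, using a line from above as the example:

  echo '%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys' | \
    awk -F'|' '{ split($3, f, " ");
                 printf("corr=%s sub=%s del=%s ins=%s wer=%s ser=%s\n",
                        f[1], f[2], f[3], f[4], f[5], f[6]) }'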
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
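A quick sanity check on the example/minibatch sizes used by these runs (the _i note just
below, and frames_per_eg / minibatch_size in the config section further down): assuming
the usual 10 ms frame shift and that --frames-per-eg counts input frames, each example
covers about 1.5 seconds of speech and each minibatch about 3.2 minutes:

  awk -v frames_per_eg=150 -v minibatch=128 -v frame_shift=0.01 'BEGIN {
    printf("seconds per eg:        %.1f\n", frames_per_eg * frame_shift)
    printf("seconds per minibatch: %.1f\n", frames_per_eg * minibatch * frame_shift)
  }'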
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.05 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 400 --jesus-forward-output-dim 1400 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5d.sh b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh new file mode 100755 index 00000000000..8e6e9358003 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5d.sh @@ -0,0 +1,407 @@ +#!/bin/bash + +# _5d is as _5b, but increasing jesus-forward-input-dim from 500 to 600 and +# jesus-forward-output-dim from 1800 to 2000. + +# It's maybe slightly helpful: WER change is (-0.2, -0.2, 0, +0.1). +#./compare_wer.sh 5b 5d +#System 5b 5d +#WER on train_dev(tg) 15.51 15.29 +#WER on train_dev(fg) 14.39 14.17 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.7 +#Final train prob -0.112013 -0.107858 +#Final valid prob -0.130879 -0.128862 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
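+
+# (In this script the mechanism referred to above is controlled by the
+#  leftmost_questions_truncate variable in the config section below, which is
+#  passed to steps/nnet3/chain/build_tree.sh as --leftmost-questions-truncate;
+#  the value -1 disables it.)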
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
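+
+# (The utils/mkgraph.sh --self-loop-scale 1.0 call in stage 13 below is the
+#  testing-time side of the _2c change described above, i.e. scales of 1.0
+#  rather than 0.)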
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
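+
+  # The --xent-regularize 0.2 and --leaky-hmm-coefficient 0.1 options in the
+  # command below are the settings inherited from the _4w and _5b changes
+  # described in the comments at the top of this script.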
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5e.sh b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh new file mode 100755 index 00000000000..ed48b0673b8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5e.sh @@ -0,0 +1,417 @@ +#!/bin/bash + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
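+
+# (Roughly speaking, --xent-regularize trains an auxiliary cross-entropy
+#  ('xent') output alongside the chain output, with the given value as the
+#  weight on that auxiliary objective; the 'final xent layer' mentioned in the
+#  _4v note below refers to that output.)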
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
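+
+# (The --lm-opts "--num-extra-lm-states=2000" setting in the training command
+#  below is the phone-LM configuration arrived at via the _z, _2a and _2i
+#  experiments described above.)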
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
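+
+  # The --xent-regularize 0.1 in the command below is the 5e change (chosen over
+  # 0.2 and 0.05 based on the 4v/4w/5c results, as described at the top of this
+  # script); --leaky-hmm-coefficient 0.1 is the 5b change.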
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5f.sh b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh new file mode 100755 index 00000000000..5fb1f0c445c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5f.sh @@ -0,0 +1,423 @@ +#!/bin/bash + +# _5f is as _5e, but making the 5b->5d change (increasing the +# number of parameters)-- increasing jesus-forward-output-dim from 1800 to 2000, +# and jesus-forward-input-dim from 500 to 600. + +# WER change is (-0.1, -0.2, +0.2, +0.1). So zero on average. +# This means 5e remains the best system so far. + +#./compare_wer.sh 5e 5f +#System 5e 5f +#WER on train_dev(tg) 15.43 15.35 +#WER on train_dev(fg) 14.32 14.15 +#WER on eval2000(tg) 17.3 17.5 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.10574 +#Final valid prob -0.129184 -0.128112 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.05 is better than 0.2 or 0.1). + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# The --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# This is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
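+
+# (The frames_per_eg=150 and minibatch_size=128 defaults in the config section
+#  below match the _i change described in the notes that follow: 150-frame egs
+#  with 128-element minibatches.)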
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
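+
+  # The --jesus-forward-input-dim 600 and --jesus-forward-output-dim 2000 in the
+  # command below implement the 5b->5d parameter increase described at the top
+  # of this script (up from 500 and 1800 in 5e).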
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 2000 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5g.sh b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh new file mode 100755 index 00000000000..784facf5a82 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5g.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + + +# Here is decoding with --frames-per-chunk 300. A fairly consistent +# improvement. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.27 +#WER on train_dev(fg) 14.32 14.21 +#WER on eval2000(tg) 17.3 16.9 +#WER on eval2000(fg) 15.5 15.2 +#Final train prob -0.110056 -0.103752 +#Final valid prob -0.129184 -0.125641 + + +# *All results below here are broken-- they were computed when I had a bug in +# the index-permutation, and the blocks weren't computed right for the jesus +# layer.* +# Here are WERs when the frames-per-chunk was 50: +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.62 +#WER on train_dev(fg) 14.32 14.42 +#WER on eval2000(tg) 17.3 17.7 +#WER on eval2000(fg) 15.5 16.0 + +# and here with 150: +# WER on train_dev(tg) 15.43 15.46 +# WER on train_dev(fg) 14.32 14.38 +# WER on eval2000(tg) 17.3 17.3 +# WER on eval2000(fg) 15.5 15.5 + + +# and here with 300 ... we do see a small improvement +# at this value. 
(could probably improve it further +# by modifying the model to average over a larger window). +#WER on train_dev(tg) 15.43 15.29 +#WER on train_dev(fg) 14.32 14.17 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.4 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + +# Below is also with chunk-size=300, but with the 'wide' model +# that sees more context. Oddly, the WER is worse. It looks like +# the model may be doing something different than just learning +# speaker characteristics. +#./compare_wer.sh 5e 5g +#System 5e 5g +#WER on train_dev(tg) 15.43 15.54 +#WER on train_dev(fg) 14.32 14.34 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.5 15.6 +#Final train prob -0.110056 -0.105725 +#Final valid prob -0.129184 -0.125756 + + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# if [ $stage -le 15 ]; then +# # get wide-context model +# nnet3-am-copy --binary=false $dir/final.mdl - | \ +# sed 's/Context> 99/Context> 306/g' | nnet3-am-copy - $dir/wide.mdl +# for decode_set in train_dev eval2000; do +# ( +# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --frames-per-chunk 300 --iter wide \ +# --nj 50 --cmd "$decode_cmd" \ +# --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ +# $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +# fi + + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5h.sh b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh new file mode 100755 index 00000000000..5eeb5ca5d03 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5h.sh @@ -0,0 +1,434 @@ +#!/bin/bash + +# _5h is as _5g, but only mean, no stddev, stats. + +# The following comparison is with 150 frames per chunk +# in both the 5g and 5h decodes. No consistent WER difference +# with either 5e or 5g. 
+#System 5e 5g 5h +#WER on train_dev(tg) 15.43 15.46 15.45 +#WER on train_dev(fg) 14.32 14.38 14.34 +#WER on eval2000(tg) 17.3 17.3 17.2 +#WER on eval2000(fg) 15.5 15.5 15.7 +#Final train prob -0.110056 -0.105725 -0.106213 +#Final valid prob -0.129184 -0.125756 -0.126809 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5i.sh b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh new file mode 100755 index 00000000000..9ffc37793ee --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5i.sh @@ -0,0 +1,432 @@ +#!/bin/bash + +# _5i is as _5g, but adding the mean+stddev features for all hidden layers. +# a little worse than 5g (but for Remi Francis it was a little better). +#local/chain/compare_wer.sh 5e 5g 5i +#System 5e 5g 5i +#WER on train_dev(tg) 15.43 15.27 15.41 +#WER on train_dev(fg) 14.32 14.21 14.47 +#WER on eval2000(tg) 17.3 16.9 17.0 +#WER on eval2000(fg) 15.5 15.2 15.4 +#Final train prob -0.110056 -0.103752 -0.102539 +#Final valid prob -0.129184 -0.125641 -0.12375 + +# _5g is as _5e, but adding one statistics-extraction layer to the +# splice indexes, in the middle of the network (with both mean +# and stddev). + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2,mean+stddev(-99:1:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -3,0,3,mean+stddev(-99:3:9:99) -6,-3,0,mean+stddev(-99:3:9:99)" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5j.sh b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh new file mode 100755 index 00000000000..892a79fd2a8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5j.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
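+
+  # Unlike the 5i run earlier in this patch (and the 5e baseline this script
+  # mirrors), the call below passes no --online-ivector-dir option; that is
+  # the "omitting the iVectors" change described at the top of this script,
+  # so the network sees only the hires MFCC features.  With the plain
+  # splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0", the summed
+  # extreme offsets give a total model context of roughly 17 frames to the
+  # left and 12 to the right.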
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5k.sh b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh new file mode 100755 index 00000000000..b6c984ed253 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5k.sh @@ -0,0 +1,454 @@ +#!/bin/bash + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
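+
+  # Relative to 5j, the only change to the network spec below is the
+  # mean+stddev(-99:3:9:99) statistics-pooling entry added to the fourth
+  # splice-indexes group (the statistics-extraction layer described at the
+  # top of this script); the egs themselves are re-used from the 5j run via
+  # --egs-dir.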
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5j_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5l.sh b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh new file mode 100755 index 00000000000..d5b51eb7551 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5l.sh @@ -0,0 +1,464 @@ +#!/bin/bash + +# _5l is as _5k, but doubling frames-per-eg from 150 to 300, and increasing +# the context radius of the statistics-pooling from 99 to 153. + +# :-( No better than 5k.) +#./compare_wer.sh 5e 5j 5k 5l +#System 5e 5j 5k 5l +#WER on train_dev(tg) 15.43 17.59 16.46 16.68 +#WER on train_dev(fg) 14.32 16.33 15.17 15.40 +#WER on eval2000(tg) 17.3 19.1 18.1 18.3 +#WER on eval2000(fg) 15.5 17.5 16.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502-0.0804455 +#Final valid prob -0.129184 -0.130761 -0.12337 -0.10712 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. 
+#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 # was 2; now 2 / sqrt(2) = sqrt(2), since we're using half the minibatch size. +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --frames-per-eg 300 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-153:3:9:153) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size 64 \ + --egs-opts "--frames-overlap-per-eg 0" \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5m.sh b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh new file mode 100644 index 00000000000..a9e12357c23 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5m.sh @@ -0,0 +1,430 @@ +#!/bin/bash + +# _5m is as _5e, but with a script change where we are randomizing +# the frame shift a bit better. + +# No very clear change, but if anything the optimization is less effective +# and the WER worse -> I'm going to revert this script change. +#System 5e 5m +#WER on train_dev(tg) 15.43 15.57 +#WER on train_dev(fg) 14.32 14.47 +#WER on eval2000(tg) 17.3 17.2 +#WER on eval2000(fg) 15.5 15.7 +#Final train prob -0.110056 -0.112539 +#Final valid prob -0.129184 -0.129006 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
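+
+# [Editorial illustration, not from the original notes:] the egs-edge handling
+# described in the '_p' note above amounts to storing a per-frame weight with each
+# eg and multiplying the derivative w.r.t. the output by it, instead of using
+# --min-deriv-time/--max-deriv-time.  Assuming the ramp is linear (the note only
+# says it ramps up to 1.0 over 10 frames), a 150-frame eg would get weights
+# roughly like:
+#   frames   1-10  : 0.0                    (zero deriv at the overlapped edge)
+#   frames  11-20  : 0.1, 0.2, ..., 1.0     (linear ramp)
+#   frames  21-130 : 1.0                    (full weight in the interior)
+#   frames 131-150 : mirror image of the left edge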
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
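+
+  # [Editorial note, hedged:] utils/create_split_dir.pl above spreads the (large) egs
+  # archives over several filesystems: the brace expansion /export/b0{5,6,7,8}/...
+  # gives four candidate directories (b05, b06, b07, b08), and the script is expected
+  # to populate $dir/egs/storage with numbered subdirectories that are symlinks
+  # cycling through those locations, so no single disk holds all of the examples.
+  # This only happens on the CLSP grid (see the hostname check above); elsewhere the
+  # egs are simply written under $dir/egs.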
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5n.sh b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh new file mode 100755 index 00000000000..d4372a418d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5n.sh @@ -0,0 +1,459 @@ +#!/bin/bash + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. 
+#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
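+
+  # [Editorial note on the frame-rate bookkeeping, derived from the options used in
+  # this script:] the _dbl features are extracted every 5 ms instead of 10 ms, so
+  # there are twice as many input frames per second.  Hence --frame-subsampling-factor
+  # is 6 (6 x 5 ms = 30 ms between chain outputs, as in the usual 10-ms recipes with
+  # factor 3), while --alignment-subsampling-factor stays at 3 because the tri4
+  # lattices are still at the 10-ms rate.  For the same reason frames_per_eg is
+  # doubled to 300 (about 1.5 s of audio per chunk), --frames-per-iter is doubled to
+  # 2400000, and "0.005" is written to $dir/frame_shift so the scoring scripts know
+  # the frame shift in seconds.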
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5o.sh b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh new file mode 100755 index 00000000000..86bbe1ad441 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5o.sh @@ -0,0 +1,467 @@ +#!/bin/bash + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
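+
+# [Editorial note, hedged:] as we understand these options, --l2-regularize adds an
+# L2 penalty on the network's chain output to the training objective (scaled by the
+# given constant), and --xent-regularize (see the 4v/4w/5e notes above) adds a
+# separate cross-entropy output layer whose objective is weighted by the given
+# factor; both act only as regularizers during training, and decoding uses the main
+# chain output.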
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
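+
+  # [Editorial worked example, based on the --splice-indexes passed below:] the total
+  # acoustic context of the network is the sum of the per-layer offsets:
+  #   left  context: 1 + 2 + 2 + 4 + 6 + 6 + 12 = 33 input frames
+  #   right context: 1 + 2 + 2 + 2 + 6 + 6 + 0  = 19 input frames
+  # At the 5 ms input frame shift used here that is roughly 165 ms of left and 95 ms
+  # of right context per output frame (the 5n setup has 31 and 17 frames respectively).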
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl2 $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires_dbl2 $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl2 \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5p.sh b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh new file mode 100755 index 00000000000..d2ef7057873 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5p.sh @@ -0,0 +1,421 @@ +#!/bin/bash + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
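+
+# (For context on the option mentioned just above: --leftmost-questions-truncate is a
+# tree-building option -- it is passed through to steps/nnet3/chain/build_tree.sh in the
+# script below -- which, roughly speaking, limits the set of questions that can be asked
+# about the leftmost phone-context position; setting it to -1, as these later experiments
+# do, disables that truncation.)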
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
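+
+# [Aside: the --num-extra-lm-states / --num-extra-states options above control the size of
+# the phone-level denominator LM.  As a rough sketch of how such an LM gets estimated with
+# the current chain tools (the exact commands and paths used inside the training script may
+# differ; this is illustrative only):
+#   ali-to-phones $treedir/final.mdl "ark:gunzip -c $treedir/ali.*.gz|" ark:- | \
+#     chain-est-phone-lm --num-extra-lm-states=2000 ark:- $dir/phone_lm.fst
+# ]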
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
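+
+  # Quick notes on the chain-specific options passed below (summarizing how these options
+  # are generally described for 'chain' training; see the chain code/docs for details):
+  #   --xent-regularize 0.1        adds a separate cross-entropy output and mixes 0.1 times
+  #                                its objective into the LF-MMI objective, as a regularizer;
+  #   --leaky-hmm-coefficient 0.1  lets a small amount of probability "leak" between states
+  #                                of the denominator HMM, smoothing the denominator
+  #                                forward-backward computation;
+  #   --l2-regularize 0.00005      puts an l2 penalty on the network's output values.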
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5q.sh b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh new file mode 100755 index 00000000000..5968a00417e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5q.sh @@ -0,0 +1,425 @@ +#!/bin/bash + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. [abandoned after discovering bug, +# this thread is picked up in 5s and 5t.] + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
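+
+  # Reading guide for the network options below: each space-separated group in
+  # --splice-indexes lists the time offsets spliced together at one layer (so "-1,0,1" means
+  # that layer sees frames t-1, t, t+1), and in some of the experiments above a ":N" suffix
+  # on a group adds recurrence at that offset.  In --jesus-opts, the forward input/output
+  # dims and the hidden dim set the sizes of the block-structured "jesus" layers, which is
+  # why 5q and 5r shrink them to reduce the parameter count.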
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5r.sh b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh new file mode 100755 index 00000000000..306d76859f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5r.sh @@ -0,0 +1,427 @@ +#!/bin/bash + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. +# [abandoned after discovering bug, this thread is picked up in 5s and 5t.] + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
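+
+  # Note on the two lines above: utils/create_split_dir.pl makes $dir/egs/storage a directory
+  # whose numbered subdirectories are (roughly speaking) symlinks spread over the listed
+  # /export/b0{5,6,7,8} disks, so the large egs get distributed across filesystems on the
+  # CLSP grid; the .nodelete file is just a marker so the egs are kept if this run dies.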
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --egs-dir exp/chain/tdnn_2y_sp/egs \
+    --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1500 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --online-ivector-dir exp/nnet3/ivectors_${train_set} \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          --online-ivector-dir exp/nnet3/ivectors_${decode_set} \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0;
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5s.sh b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh
new file mode 100755
index 00000000000..65da1e06183
--- /dev/null
+++ b/egs/swbd/s5c/local/chain/run_tdnn_5s.sh
@@ -0,0 +1,441 @@
+#!/bin/bash
+
+# Comparing with 5e which is the most recent baseline we actually decoded,
+# 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced from 7500 to 5000, and
+# the new option --self-repair-scale 0.00001 added.
+# Also compare 5t and 5v which have even smaller jesus-hidden-dims.
+
+#./compare_wer.sh 5e 5s 5t
+#System                    5e        5s        5t
+#WER on train_dev(tg)   15.43     15.47     15.43
+#WER on train_dev(fg)   14.32     14.31     14.34
+#WER on eval2000(tg)      17.3      17.4      17.4
+#WER on eval2000(fg)      15.5      15.6      15.6
+#Final train prob     -0.110056 -0.110928 -0.110752
+#Final valid prob     -0.129184 -0.132139 -0.129123
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+ +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 5000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5t.sh b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh new file mode 100755 index 00000000000..9831417003b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5t.sh @@ -0,0 +1,445 @@ +#!/bin/bash + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. Seems to make no difference to WERs; valid prob improves. + +#local/chain/compare_wer.sh 5e 5s 5t +#System 5e 5s 5t +#WER on train_dev(tg) 15.43 15.47 15.43 +#WER on train_dev(fg) 14.32 14.31 14.34 +#WER on eval2000(tg) 17.3 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 +#Final train prob -0.110056 -0.110928 -0.110752 +#Final valid prob -0.129184 -0.132139 -0.129123 + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. 
+ +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. 
+ +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5u.sh b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh new file mode 100755 index 00000000000..34fe30993cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5u.sh @@ -0,0 +1,505 @@ +#!/bin/bash + +# _5u is as _5o but modifying the mfcc generation to use a narrower window while +# generating the lower-order mfcc coefficients (the first 10). + +# Abandoning it partway through after I got the following less-than-promising diagnostics. +# grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_valid.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146977 + -0.0159528 = -0.16293 per frame, over 20000 frames. +# exp/chain/tdnn_5u_sp/log/compute_prob_valid.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.147207 + -0.015692 = -0.162899 per frame, over 20000 frames. +# a03:s5c: grep Overall exp/chain/tdnn_5{o,u}_sp/log/compute_prob_train.84.log | grep -v xent +# exp/chain/tdnn_5o_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.146703 + -0.0165036 = -0.163207 per frame, over 20000 frames. 
+# exp/chain/tdnn_5u_sp/log/compute_prob_train.84.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:175) Overall log-probability for 'output' is -0.145524 + -0.0162272 = -0.161751 per frame, over 20000 frames. + +# _5o is as _5n but adding an extra splicing layer and increasing the +# splice-width slightly on the 1st layer, to get closer to the context in 5n; +# having one more layer running at double-frequency, and reverting the frame-length to +# the same as in the baseline (25ms) to avoid sacrificing frequency resolution. + +# Objective functions improve but WER change is quite small vs 5n (~0.1%). so +# not clear that the extra time is worth it (it's noticeably slower to train as +# that extra layer is at a higher sampling rate). +# +#System 5j 5n 5o +#WER on train_dev(tg) 17.59 16.85 16.83 +#WER on train_dev(fg) 16.33 15.67 15.60 +#WER on eval2000(tg) 19.1 19.1 18.8 +#WER on eval2000(fg) 17.5 17.3 17.2 +#Final train prob -0.114691 -0.116341 -0.111613 +#Final valid prob -0.130761 -0.130884 -0.126765 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. 
+# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data with normal window size. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl2 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl2.conf \ + data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl2 exp/make_hires_dbl2/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl2 # remove segments with problems + done +fi + +# Generate double-frame-rate version of the data with smaller than normal window size; +# and only keeping the first 10 MFCC coefficients. 
+if [ $stage -le 13 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_dbl3 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_dbl3.conf \ + data/${dataset}_dbl3 exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_dbl3 # remove segments with problems + done +fi + +# select dimension 10-39 of the dbl2 features, then create pasted features consisting +# of the 10 dimensions of the dbl3, plus the selected dimensions 10-39 of dbl2. +if [ $stage -le 14 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + steps/select_feats.sh --cmd "$train_cmd --max-jobs-run 4" 10-39 data/${dataset}_hires_dbl2 data/${dataset}_hires_dbl2_select \ + exp/make_dbl3/$dataset $mfccdir + rm data/${dataset}_hires_dbl2_select/cmvn.scp 2>/dev/null || true + steps/paste_feats.sh --cmd "$train_cmd --max-jobs-run 4" data/${dataset}_hires_dbl2_select data/${dataset}_dbl3 data/${dataset}_pasted \ + exp/make_dbl3/$dataset $mfccdir + steps/compute_cmvn_stats.sh data/${dataset}_pasted exp/make_dbl3/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_pasted + done +fi + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 2400000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_pasted $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 17 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --extra-left-context 20 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_pasted $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_pasted \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5v.sh b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh new file mode 100755 index 00000000000..b33f013b894 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5v.sh @@ -0,0 +1,459 @@ +#!/bin/bash
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+# I ended up running it again after I suspected that we had 'got lucky' with
+# this particular run (since various experiments using 5v as a starting point
+# were failures); that rerun is the 5v2 run.
+#
+# local/chain/compare_wer.sh 5e 5s 5t 5v 5v2
+# System                      5e        5s        5t        5v        5v2
+# WER on train_dev(tg)      15.43     15.47     15.43     15.38     15.74
+# WER on train_dev(fg)      14.32     14.31     14.34     14.39     14.50
+# WER on eval2000(tg)        17.3      17.4      17.4      17.4      17.5
+# WER on eval2000(fg)        15.5      15.6      15.6      15.7      15.9
+# Final train prob      -0.110056 -0.110928 -0.110752  -0.11156 -0.112155
+# Final valid prob      -0.129184 -0.132139 -0.129123 -0.131797 -0.129516
+
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also fixing a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. I stopped the p,q,r runs after I found this, but in
+# configuring this run I'm bearing in mind the train and valid probs from the
+# p,q,r runs.
+
+# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000.
+
+# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try
+# to compensate for the fact that more of the output dimensions are now being
+# usefully used.
+
+# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair
+# ReLUs that are over or under-saturated.
+
+# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on
+# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05).
+
+# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen
+# in the train and valid probs.
+#System                      5b        5e
+#WER on train_dev(tg)      15.51     15.43
+#WER on train_dev(fg)      14.39     14.32
+#WER on eval2000(tg)        17.3      17.3
+#WER on eval2000(fg)        15.6      15.5
+#Final train prob      -0.112013 -0.110056
+#Final valid prob      -0.130879 -0.129184
+
+# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1.
+
+# It does seem helpful on average: (-0.35, -0.35, -0.1, 0).
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
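+
+# (Note: several of the experiments above talk about re-using the egs; in this
+# script that is done explicitly, by pointing the training at the egs dumped for
+# the 2y run rather than re-dumping them, i.e. the train_tdnn.sh call below
+# passes something like:
+#   --egs-dir exp/chain/tdnn_2y_sp/egs \
+# )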
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
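+
+# (The phone-LM options discussed above correspond to the --lm-opts argument of
+# the training script; this script keeps the 2i-style setting, i.e. the
+# train_tdnn.sh call below passes something like:
+#   --lm-opts "--num-extra-lm-states=2000" \
+# )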
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5v # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
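+
+  # The options below are what distinguish 5v from its predecessors: the reduced
+  # --jesus-hidden-dim of 2500 (vs. 3500 in 5t and 5000 in 5s), the
+  # --self-repair-scale 0.00001 and --jesus-forward-output-dim 1700 introduced
+  # around 5p/5s, and re-use of the egs from the 2y run via --egs-dir.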
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh new file mode 100755 index 00000000000..1a40acfa105 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -0,0 +1,469 @@ +#!/bin/bash + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. 
+# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
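+
+# (Note: since 5w is an ivector-free setup (see the 5j/5k notes above), the
+# train_tdnn.sh call below does not pass --online-ivector-dir, unlike e.g.
+# run_tdnn_5v.sh, which passes --online-ivector-dir exp/nnet3/ivectors_${train_set}.)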
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5w # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+  fi
+
+  touch $dir/egs/.nodelete # keep egs around when that run dies.
+
+  steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
+    --xent-regularize 0.1 \
+    --leaky-hmm-coefficient 0.1 \
+    --l2-regularize 0.00005 \
+    --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \
+    --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-99:3:9:99) -3,0,3 -6,-3,0" \
+    --apply-deriv-weights false \
+    --frames-per-iter 1200000 \
+    --lm-opts "--num-extra-lm-states=2000" \
+    --get-egs-stage $get_egs_stage \
+    --minibatch-size $minibatch_size \
+    --egs-opts "--frames-overlap-per-eg 0" \
+    --frames-per-eg $frames_per_eg \
+    --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
+    --feat-type raw \
+    --cmvn-opts "--norm-means=false --norm-vars=false" \
+    --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
+    --max-param-change $max_param_change \
+    --cmd "$decode_cmd" \
+    --remove-egs $remove_egs \
+    data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
+fi
+
+if [ $stage -le 13 ]; then
+  # Note: it might appear that this $lang directory is mismatched, and it is as
+  # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
+  # the lang directory.
+  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+fi
+
+decode_suff=sw1_tg
+graph_dir=$dir/graph_sw1_tg
+if [ $stage -le 14 ]; then
+  for decode_set in train_dev eval2000; do
+      (
+      steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+          --frames-per-chunk 300 \
+          --nj 50 --cmd "$decode_cmd" \
+          $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1;
+      if $has_fisher; then
+          steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+            data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \
+            $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1;
+      fi
+      ) &
+  done
+fi
+wait;
+exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..88dc28c2354 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash
+
+# _5x is as _5w but decreasing the context of the averaging layer from +-0.99
+# seconds to +-0.66 seconds. I would not have expected this to work a priori,
+# but the change from 5k -> 5l, which made the context wider, made WERs slightly
+# worse, so I'd like to see what happens when we decrease the context.
+
+# It's worse. Odd because increasing the context (5k->5l) seemed to be a little
+# worse also.
+# local/chain/compare_wer.sh 5w 5x
+#System                       5w        5x
+#WER on train_dev(tg)      16.56     16.66
+#WER on train_dev(fg)      15.30     15.41
+#WER on eval2000(tg)        18.1      18.5
+#WER on eval2000(fg)        16.4      16.6
+#Final train prob      -0.106549 -0.105693
+#Final valid prob      -0.120079 -0.121834
+
+# _5w is as _5k (which is a fairly good-performing ivector-free model), but
+# making the same changes as 5e -> 5t, which makes the model more lightweight
+# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to
+# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim
+# from 1800 to 1700.
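+
+# (The "averaging layer" mentioned above is the statistics-extraction layer that
+# 5w requests through its --splice-indexes string, via the entry
+#   mean+stddev(-99:3:9:99)
+# (see run_tdnn_5w.sh); the -99 and 99 are the edges of the pooling window in
+# frames, i.e. the +-0.99 second context, and this script narrows that window
+# to about +-0.66 seconds.)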
+ +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
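+
+# For reference: the "%WER ..." numbers quoted throughout these comments come
+# from the decode directories.  The following is only a sketch of what a
+# show_wer.sh-style helper might do (the helper itself is not shown in this
+# patch), assuming the wer_* and score_*/*.sys layouts that appear in the paths
+# above; 'suffix' is a hypothetical variable naming the experiment to report on.
+#   suffix=2y
+#   for d in exp/chain/tdnn_${suffix}_sp/decode_train_dev_*; do
+#     grep WER $d/wer_* | utils/best_wer.sh          # train_dev WERs live in wer_* files
+#   done
+#   for d in exp/chain/tdnn_${suffix}_sp/decode_eval2000_*; do
+#     grep Sum $d/score_*/*.sys | utils/best_wer.sh  # eval2000 WERs live in sclite .sys files
+#   done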
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..54769c23734 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
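+
+# A note on reading the --splice-indexes strings quoted in these comments:
+# ignoring the recurrence suffixes (like ":-3") and any pooling specs, the total
+# left/right context of the network is roughly the sum over layers of the most
+# negative / most positive offsets, and that total context is what decides
+# whether previously dumped egs can be re-used.  A small sketch (my own helper,
+# not part of this setup), applied to the _3d configuration above:
+#   echo "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" | \
+#     awk '{ for (i = 1; i <= NF; i++) { n = split($i, a, ","); left += -a[1]; right += a[n] }
+#            print "total left context:", left, " total right context:", right }'
+#   # prints: total left context: 18  total right context: 13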
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it's mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
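+
+# To make the _p note above concrete: near each edge of an eg the derivative
+# weights described there would look roughly like the sequence below (10 frames
+# of zero weight, then a 10-frame ramp up to 1.0).  This is only an illustration
+# of the scheme as described; the actual weights are dumped with the egs.
+#   awk 'BEGIN { for (t = 0; t < 25; t++) {
+#          w = (t < 10) ? 0.0 : (t < 20 ? (t - 9) / 10.0 : 1.0);
+#          printf("frame %d: weight %.1f\n", t, w) } }'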
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..94843bfa2c9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems not helpful. I'll remove the option soon. +# note: 5v2 is a rerun of 5v. + +# local/chain/compare_wer.sh 5v 5v2 5z +# System 5v 5v2 5z +# WER on train_dev(tg) 15.38 15.74 15.60 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.6 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.113823 +# Final valid prob -0.131797 -0.129516 -0.131356 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
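+
+# The "Final train prob" / "Final valid prob" rows in the tables above are the
+# 'chain' objective values from the training diagnostics.  A sketch of how such
+# numbers can be pulled out of the experiment directories; the log names
+# (log/compute_prob_{train,valid}.final.log) are an assumption about the layout
+# produced by the training script, not something configured here.
+#   for x in 5v 5v2 5z; do
+#     echo "=== $x ==="
+#     grep Overall exp/chain/tdnn_${x}_sp/log/compute_prob_{train,valid}.final.log
+#   done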
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
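+
+# Related to the _2c note above about scales: since training uses
+# transition-scale and self-loop-scale of 1, graph creation and decoding further
+# down in this script are run as
+#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+#   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 ...
+# (the same commands as stages 13 and 14 below).  --acwt 1.0 is the acoustic
+# scale 'chain' models are decoded with, and --post-decode-acwt 10.0 scales the
+# acoustic scores up in the written lattices so that the usual scoring setup
+# (LM-scale directories like wer_10 or score_9 quoted above) can be reused.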
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..c618d1c0adf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,490 @@ +#!/bin/bash + +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). 
+# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. 
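+
+# A back-of-the-envelope check of the "about a million parameters" figure in the
+# _5y note above, under the rough assumption that the final affine layer has
+# about final-hidden-dim * num-leaves weights (ignoring biases and the extra
+# cross-entropy output); the tree built below targets 9000 leaves:
+#   echo $(( (500 - 400) * 9000 ))   # ~900k weights moved out of the final layer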
+ +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
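+
+# A quick sanity check on the --frames-per-iter figure in the _2y note above,
+# assuming the number of egs processed per iteration is roughly
+# frames-per-iter divided by frames-per-eg (which is 150 below):
+#   echo $(( 1200000 / 150 ))   # => 8000 egs per iteration, summed over all jobs.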
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
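+
+# A sketch of the per-frame deriv weights near one edge of an eg, for the _p
+# edge-handling note above (assuming 10 zero-weight frames followed by a linear
+# ramp over the next 10; the real weights are dumped with the egs):
+#   for t in $(seq 0 24); do
+#     awk -v t=$t 'BEGIN{ w = (t < 10) ? 0 : (t < 20 ? (t - 9) / 10 : 1); printf "frame %2d  weight %.1f\n", t, w }'
+#   done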
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
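+
+  # Rough context arithmetic for the --splice-indexes string passed below,
+  # assuming the per-layer offsets simply accumulate across layers:
+  #   left context  = 1+1+3+3+3+6 = 17 frames
+  #   right context = 1+2+3+3+3+0 = 12 frames
+  # echo $(( 1+1+3+3+3+6 )) $(( 1+2+3+3+3+0 ))   # => 17 12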
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6b.sh b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh new file mode 100755 index 00000000000..5cd3f7dfbf2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6b.sh @@ -0,0 +1,480 @@ +#!/bin/bash + +# _6b is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). 
+#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. 
Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6b # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
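+
+  # Note on --xent-regularize 0.1 below: as I understand it, this adds a
+  # second, cross-entropy-trained output branch whose objective is scaled by
+  # 0.1 and added to the 'chain' objective as seen by the shared layers,
+  # roughly
+  #   total_objf ~= chain_objf + 0.1 * xent_objf
+  # so it acts as a regularizer on the hidden parts of the network.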
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6c.sh b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh new file mode 100755 index 00000000000..7334a5e185e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6c.sh @@ -0,0 +1,468 @@ +#!/bin/bash + +# _6c is as _5v but adding "--thick-jesus-layer true" (new option): extra hidden +# layer inside jesus layer. + +# Note: 5v2 is a rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6c +#System 5v 5v2 6c +#WER on train_dev(tg) 15.38 15.74 15.54 +#WER on train_dev(fg) 14.39 14.50 14.55 +#WER on eval2000(tg) 17.4 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.114084 +#Final valid prob -0.131797 -0.129516 -0.129589 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
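+
+# For scale when reading the 6c numbers: 5v and 5v2 in the table near the top
+# of this file are the same configuration, so their spread gives a feel for
+# rerun-to-rerun noise, e.g.
+#   awk 'BEGIN{ printf "train_dev(tg) rerun delta = %.2f\n", 15.74 - 15.38 }'   # => 0.36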
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6c # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --thick-jesus-layer true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6d.sh b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh new file mode 100755 index 00000000000..80b6a18cabf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6d.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + +# no clear difference. +#[note, 5v2 is a rerun of 5v]. +# local/chain/compare_wer.sh 5v 5v2 6d +# System 5v 5v2 6d +# WER on train_dev(tg) 15.38 15.74 15.66 +# WER on train_dev(fg) 14.39 14.50 14.54 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.8 +# Final train prob -0.11156 -0.112155 -0.112034 +# Final valid prob -0.131797 -0.129516 -0.131714 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
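+
+# Arithmetic behind the --num-jesus-blocks 84 note at the top of this file,
+# assuming the per-block dimension is just --jesus-forward-input-dim (here 500)
+# divided by --num-jesus-blocks, rounded:
+#   awk 'BEGIN{ printf "%.0f vs %.0f\n", 500/84, 500/100 }'   # => 6 vs 5 dims per block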
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
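+
+# A quick sanity-check (not used by the recipe) of the temporal context implied
+# by the --splice-indexes string passed to train_tdnn.sh further below, assuming
+# the model's total left/right context is just the per-layer offsets summed:
+# echo "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" | \
+#   awk '{ l=0; r=0;
+#          for (i=1; i<=NF; i++) { n=split($i, a, ","); l+=a[1]; r+=a[n] }
+#          printf("left context %d, right context %d\n", -l, r) }'
+# # -> left context 17, right context 12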
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6d # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 84 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6e.sh b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh new file mode 100755 index 00000000000..d44973db7ba --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6e.sh @@ -0,0 +1,464 @@ +#!/bin/bash + + +# _6e is as _6d but going further: reducing --num-jesus-blocks to 72 = ceil(500/7). + +# +# _6d is as _5v but changing adding --num-jesus-blocks 84 (default is 100). +# this means (after rounding) that we have 6, not 5, as +# --jesus-forward-input-dim / --num-jesus-blocks. + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
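+
+# A quick sanity-check (not used by the recipe) of the per-block dimension
+# arithmetic in the _6d/_6e notes above, assuming the config generator rounds
+# jesus-forward-input-dim / num-jesus-blocks to the nearest integer:
+# for blocks in 100 84 72; do
+#   awk -v dim=500 -v b=$blocks \
+#     'BEGIN { printf("num-jesus-blocks %3d -> %d inputs per block\n", b, int(dim/b + 0.5)) }'
+# done
+# # -> 100 gives 5, 84 gives 6, 72 gives 7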
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
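+
+# A sketch (not the actual get_egs code) of the shape of the per-frame
+# derivative weights described in the _p note below: roughly 10 zero-weight
+# frames at each edge of an eg, then a linear ramp up to 1.0 over the next 10
+# frames; the exact values the scripts produce may differ.
+# awk -v T=150 'BEGIN { for (t=0; t<T; t++) {
+#   d = (t < T-1-t) ? t : T-1-t;                  # distance to the nearer edge
+#   w = (d < 10) ? 0.0 : ((d < 20) ? (d-10)/10.0 : 1.0);
+#   printf("frame %3d  weight %.1f\n", t, w) } }'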
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6e # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--num-jesus-blocks 72 --jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6f.sh b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh new file mode 100755 index 00000000000..fb7ff03b66d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6f.sh @@ -0,0 +1,470 @@ +#!/bin/bash + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +# note, 5v2 is a rerun of 5v. +# local/chain/compare_wer.sh 5v 5v2 6f +# System 5v 5v2 6f +# WER on train_dev(tg) 15.38 15.74 15.71 +# WER on train_dev(fg) 14.39 14.50 14.50 +# WER on eval2000(tg) 17.4 17.5 17.5 +# WER on eval2000(fg) 15.7 15.9 15.9 +# Final train prob -0.11156 -0.112155 -0.111305 +# Final valid prob -0.131797 -0.129516 -0.131487 + + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
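+
+# A minimal sketch (assumed variable names; the recipe's own logic may differ)
+# of how the _sp suffix mentioned in the config comment below gets attached to
+# the directory names when $speed_perturb is true:
+# if [ "$speed_perturb" == "true" ]; then suffix=_sp; else suffix=; fi
+# dir=${dir}$suffix       # e.g. exp/chain/tdnn_6f -> exp/chain/tdnn_6f_sp
+# train_set=train_nodup$suffix   # hypothetical; matches the _nodup$suffix naming used below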
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6f # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6g.sh b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh new file mode 100755 index 00000000000..8d4e8b79fd0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6g.sh @@ -0,0 +1,491 @@ +#!/bin/bash + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# seems better than 6f, and about the same as (5v,5v2). encouraging. +# note, 5v2 is rerun of 5v. +#local/chain/compare_wer.sh 5v 5v2 6f 6g +#System 5v 5v2 6f 6g +#WER on train_dev(tg) 15.38 15.74 15.71 15.50 +#WER on train_dev(fg) 14.39 14.50 14.50 14.31 +#WER on eval2000(tg) 17.4 17.5 17.5 17.5 +#WER on eval2000(fg) 15.7 15.9 15.9 15.8 +#Final train prob -0.11156 -0.112155 -0.111305 -0.105853 +#Final valid prob -0.131797 -0.129516 -0.131487 -0.129997 + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6g # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
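+
+  # As a rough sanity check (assuming the per-layer offsets in --splice-indexes
+  # compose additively across layers, as is usual for TDNNs), the indexes used
+  # below ("-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0") give a total acoustic
+  # context of about 17 frames on the left and 12 on the right per output frame:
+  #   left  = 1+1+3+3+3+6 = 17
+  #   right = 1+2+3+3+3+0 = 12
+  # A small bash sketch of that computation (illustrative only, not part of the
+  # recipe; uncomment to run):
+  #   left=0; right=0
+  #   for layer in "-1,0,1" "-1,0,1,2" "-3,0,3" "-3,0,3" "-3,0,3" "-6,-3,0"; do
+  #     min=$(echo $layer | tr ',' '\n' | sort -n | head -1)
+  #     max=$(echo $layer | tr ',' '\n' | sort -n | tail -1)
+  #     left=$((left - min)); right=$((right + max))
+  #   done
+  #   echo "left=$left right=$right"   # -> left=17 right=12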
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh new file mode 100755 index 00000000000..f3065cec603 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h.sh @@ -0,0 +1,494 @@ +#!/bin/bash + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. +#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. 
+#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh new file mode 100755 index 00000000000..85afa7bf9ca --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_discriminative.sh @@ -0,0 +1,265 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of chain nnet3 system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. 
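+#
+# Rough outline of the stages below (a summary of this script, reading from the
+# stage numbers used further down):
+#   - early stage: if frame_subsampling_factor > 1, make frame-shifted copies of
+#     the training data and i-vectors so all input frame offsets are covered
+#   - stage 1: steps/nnet3/align.sh        -- align the training data with the chain model
+#   - stage 2: steps/nnet3/make_denlats.sh -- generate denominator lattices
+#   - stage 3: steps/nnet3/get_egs_discriminative.sh -- dump discriminative examples (degs)
+#   - stage 4: steps/nnet3/train_discriminative.sh   -- sMBR training on the degs
+#   - stage 5: decode the test sets for each epoch (plus const-arpa LM rescoring)
+#   - stage 6: optional cleanup of lattices, alignments and degs (with --cleanup true)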
+# +# eval2000 + +# chain 7b +# %WER 17.2 | 4459 42989 | 84.8 10.2 5.0 2.0 17.2 54.4 | exp/chain/tdnn_7b_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 16.9 | 4459 42989 | 85.2 10.3 4.5 2.1 16.9 54.4 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch1/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 85.4 10.5 4.1 2.3 16.9 54.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch2/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.3 10.4 4.3 2.3 17.0 54.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch3/score_12_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.1 | 4459 42989 | 85.2 10.5 4.3 2.4 17.1 54.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_tg_epoch4/score_12_0.5/eval2000_hires.ctm.filt.sys + +# chain 7b +# %WER 15.5 | 4459 42989 | 86.3 9.0 4.7 1.8 15.5 51.3 | exp/chain/tdnn_7b_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 15.2 | 4459 42989 | 86.8 9.1 4.1 2.0 15.2 51.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch1/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 15.1 | 4459 42989 | 86.9 9.0 4.1 2.0 15.1 51.3 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch2/score_12_0.0/eval2000_hires.ctm.filt.sys +# %WER 15.1 | 4459 42989 | 87.0 9.1 3.9 2.1 15.1 51.2 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch3/score_12_0.5/eval2000_hires.ctm.filt.sys +# %WER 15.2 | 4459 42989 | 87.0 9.2 3.8 2.2 15.2 51.5 | exp/chain/tdnn_7b_sp_smbr/decode_eval2000_sw1_fsh_fg_epoch4/score_12_0.5/eval2000_hires.ctm.filt.sys + + +# RT'03 + +# chain 7b +# %WER 21.6 | 8420 76157 | 80.5 12.8 6.7 2.1 21.6 53.7 | exp/chain/tdnn_7b_sp/decode_rt03_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 21.0 | 8420 76157 | 81.3 12.8 5.8 2.4 21.0 53.0 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch1/score_10_0.0/rt03_hires.ctm.filt.sys +# %WER 20.8 | 8420 76157 | 81.6 12.5 6.0 2.4 20.8 53.0 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch2/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 20.8 | 8420 76157 | 81.6 12.6 5.8 2.5 20.8 53.1 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch3/score_11_0.5/rt03_hires.ctm.filt.sys +# %WER 20.9 | 8420 76157 | 81.7 12.7 5.6 2.6 20.9 53.2 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_tg_epoch4/score_11_0.0/rt03_hires.ctm.filt.sys + +# chain 7b +# %WER 19.0 | 8420 76157 | 82.7 10.2 7.2 1.7 19.0 50.0 | exp/chain/tdnn_7b_sp/decode_rt03_sw1_fsh_fg/score_10_0.0/rt03_hires.ctm.filt.sys + +# chain 7b + smbr +# %WER 18.2 | 8420 76157 | 83.7 10.4 5.9 1.9 18.2 49.3 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch1/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 18.1 | 8420 76157 | 83.9 10.7 5.4 2.1 18.1 49.3 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch2/score_11_0.0/rt03_hires.ctm.filt.sys +# %WER 18.1 | 8420 76157 | 84.0 10.7 5.3 2.1 18.1 49.4 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch3/score_11_1.0/rt03_hires.ctm.filt.sys +# %WER 18.2 | 8420 76157 | 83.8 10.5 5.7 2.1 18.2 49.6 | exp/chain/tdnn_7b_sp_smbr/decode_rt03_sw1_fsh_fg_epoch4/score_12_1.0/rt03_hires.ctm.filt.sys + +. cmd.sh + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +srcdir=exp/chain/tdnn_7b_sp +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.000000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + data_dirs= + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ + $x $train_data_dir exp/shift_hires/ mfcc_hires + utils/fix_data_dir.sh ${train_data_dir}_fs$x + data_dirs="$data_dirs ${train_data_dir}_fs$x" + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp + done + utils/combine_data.sh ${train_data_dir}_fs $data_dirs + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + rm -r ${train_data_dir}_fs$x + done + fi + + train_data_dir=${train_data_dir}_fs + + affix=_fs +fi + +rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp +done +online_ivector_dir=${online_ivector_dir}_fs + +if [ $stage -le 1 ]; then + # hardcode no-GPU for alignment, although you could use GPU [you wouldn't + # get excellent GPU utilization though.] + nj=350 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + steps/nnet3/align.sh --cmd "$decode_cmd" --use-gpu false \ + --online-ivector-dir $online_ivector_dir \ + --scale-opts "--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0" \ + --nj $nj $train_data_dir $lang $srcdir ${srcdir}_ali${affix} ; +fi + +if [ -z "$lats_dir" ]; then + lats_dir=${srcdir}_denlats${affix} + if [ $stage -le 2 ]; then + nj=50 + # this doesn't really affect anything strongly, except the num-jobs for one of + # the phases of get_egs_discriminative.sh below. + num_threads_denlats=6 + subsplit=40 # number of jobs that run per job (but 2 run at a time, so total jobs is 80, giving + # total slots = 80 * 6 = 480. 
+ steps/nnet3/make_denlats.sh --cmd "$decode_cmd" \ + --self-loop-scale 1.0 --acwt 1.0 --determinize true \ + --online-ivector-dir $online_ivector_dir \ + --nj $nj --sub-split $subsplit --num-threads "$num_threads_denlats" --config conf/decode.config \ + $train_data_dir $lang $srcdir ${lats_dir} ; + fi +fi + +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` + +left_context=$[model_left_context + extra_left_context] +right_context=$[model_right_context + extra_right_context] + +valid_left_context=$[valid_left_context + frames_per_eg] +valid_right_context=$[valid_right_context + frames_per_eg] + +cmvn_opts=`cat $srcdir/cmvn_opts` + +if [ -z "$degs_dir" ]; then + degs_dir=${srcdir}_degs${affix} + + if [ $stage -le 3 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${srcdir}_degs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/${srcdir}_degs/storage ${srcdir}_degs/storage + fi + # have a higher maximum num-jobs if + if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi + + degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" + + steps/nnet3/get_egs_discriminative.sh \ + --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ + --adjust-priors false --acwt 1.0 \ + --online-ivector-dir $online_ivector_dir \ + --left-context $left_context --right-context $right_context \ + --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ + --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; + fi +fi + +if [ $stage -le 4 ]; then + steps/nnet3/train_discriminative.sh --cmd "$decode_cmd" \ + --stage $train_stage \ + --effective-lrate $effective_learning_rate --max-param-change $max_param_change \ + --criterion $criterion --drop-frames true --acoustic-scale 1.0 \ + --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ + --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ + --regularization-opts "$regularization_opts" --use-frame-shift false \ + --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --modify-learning-rates false \ + ${degs_dir} $dir ; +fi + +graph_dir=$srcdir/graph_sw1_tg +if [ $stage -le 5 ]; then + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in train_dev eval2000 rt03; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x.adj + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_sw1_tg_$iter ; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_$iter ; + fi + ) & + done + done +fi +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. 
+ rm ${lats_dir}/lat.*.gz || true + rm ${srcdir}_ali/ali.*.gz || true + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; + diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh new file mode 100755 index 00000000000..a5a96de7f38 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6h_py.sh @@ -0,0 +1,177 @@ +#!/bin/bash + +# this is a replica of_6h script, but makes use of the python trainer +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6h_py # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + steps/nnet3/make_jesus_configs.py \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --jesus-forward-input-dim 600 \ + --jesus-forward-output-dim 1700 \ + --jesus-hidden-dim 0 \ + --jesus-stddev-scale 0.2 \ + --final-layer-learning-rate-factor 0.25 \ + --self-repair-scale 0.00001 \ + --xent-separate-forward-affine=true \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
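+
+  # For readers comparing this with the train_tdnn.sh call in run_tdnn_6h.sh (also
+  # in this patch): the old options map onto the python trainer's dotted options
+  # roughly as follows (based only on the two calls shown here):
+  #   --xent-regularize        -> --chain.xent-regularize
+  #   --leaky-hmm-coefficient  -> --chain.leaky-hmm-coefficient
+  #   --l2-regularize          -> --chain.l2-regularize
+  #   --lm-opts                -> --chain.lm-opts
+  #   --egs-dir / --egs-opts   -> --egs.dir / --egs.opts
+  #   --frames-per-eg          -> --egs.chunk-width
+  #   --minibatch-size         -> --trainer.num-chunk-per-minibatch
+  #   --num-epochs, --num-jobs-*, --*-effective-lrate -> --trainer.* options
+  #   --online-ivector-dir / --cmvn-opts -> --feat.* options
+  #   --remove-egs             -> --cleanup.remove-egs
+  # The jesus-layer configuration itself is generated separately above (stage 12)
+  # by steps/nnet3/make_jesus_configs.py.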
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6i.sh b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh new file mode 100755 index 00000000000..457b424be73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6i.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6i # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
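+
+  # For reference, the relevant numbers for this faster-frame-rate setup (a rough
+  # sketch derived from the options above, not extra configuration):
+  #   input frame shift   = 7.5 ms  (the 0.0075 s written to $dir/frame_shift; the
+  #                         _hiresf features generated in stage 12)
+  #   output frame shift  = 4 * 7.5 ms = 30 ms  (--frame-subsampling-factor 4),
+  #                         i.e. the same 30 ms as the usual 3 * 10 ms chain setup.
+  #   chunk length        = 200 * 7.5 ms = 1.5 s, matching 150 frames at 10 ms
+  #                         (hence frames_per_eg=200 in the configs above).
+  #   iVector period      = 10 frames * 10 ms = 100 ms in the original iVectors;
+  #                         100 / 7.5 = 13.33 of the new frames, rounded to 14
+  #                         (13 for the training set) in the "fake" iVector
+  #                         directories created in stage 13.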
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6j.sh b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh new file mode 100755 index 00000000000..ded13de9470 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6j.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. 
+#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. 
+ +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6j # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
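+
+  # Note on the subsampling factors in the command below (a clarifying sketch based
+  # on the 6i/6j descriptions above): 6j keeps the standard 10 ms features, so the
+  # tri4 lattices/alignments (also at 10 ms) line up with the input and both
+  # --frame-subsampling-factor and --alignment-subsampling-factor are 4, giving a
+  # 4 * 10 ms = 40 ms output frame shift.  In 6i the input is at 7.5 ms while the
+  # alignments are still at 10 ms, hence --frame-subsampling-factor 4 but
+  # --alignment-subsampling-factor 3 (both paths end up at a 30 ms output shift).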
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6k.sh b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh new file mode 100755 index 00000000000..4625da200e6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6k.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6k is as _6i, but one more epoch. After running the first few stages, I'm +# copying the last model from 6i and starting from that point, to save compute. +# No better. +#local/chain/compare_wer.sh 6i 6k +#System 6i 6k +#WER on train_dev(tg) 15.62 15.67 +#WER on train_dev(fg) 14.46 14.47 +#WER on eval2000(tg) 17.3 17.4 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417-0.0994163 +#Final valid prob -0.123985 -0.122743 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. 
+ +# local/chain/compare_wer.sh 6h 6i +# System 6h 6i +# WER on train_dev(tg) 15.46 15.62 +# WER on train_dev(fg) 14.28 14.46 +# WER on eval2000(tg) 17.4 17.3 +# WER on eval2000(fg) 15.7 15.8 +# Final train prob -0.105663 -0.10417 +# Final valid prob -0.130166 -0.123985 + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
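[Editor's note] Several of the comparisons quoted above are pasted from ./show_wer.sh, which is not included in this patch. The following is only a rough sketch of what such a helper might look like (the real script may differ); it assumes the decode-directory layout visible in the paths above and the stock utils/best_wer.sh:

#!/bin/bash
# Hypothetical show_wer.sh-style helper; not part of this patch.
# Usage: ./show_wer.sh 4f    (prints the four %WER lines quoted above)
suffix=$1
dir=exp/chain/tdnn_${suffix}_sp
for lm in sw1_tg sw1_fsh_fg; do
  # train_dev decodes keep plain wer_* files from compute-wer scoring:
  grep WER $dir/decode_train_dev_${lm}/wer_* | utils/best_wer.sh
done
for lm in sw1_tg sw1_fsh_fg; do
  # eval2000 is scored with sclite; the summary lines live in the *.sys files:
  grep Sum $dir/decode_eval2000_${lm}/score_*/*.ctm.filt.sys | utils/best_wer.sh
done
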
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6k # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6i_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6l.sh b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh new file mode 100755 index 00000000000..f1e0821f2cf --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6l.sh @@ -0,0 +1,521 @@ +#!/bin/bash + +# _6l is as _6i, but adding the option --xent-separate-forward-affine=true which +# I had accidentally omitted, and adding 4 frames more left context and 2 frames +# more right context. + +# Below I'm also comparing with 6h, which (since we now added +# --xent-separate-forward-affine=true) is the appopriate normal-frame-rate +# baseline, rather than 6g. + +# This experiment is better than 6i, but there is no clear difference with +# 6h. So we can't really say that we're getting any benefit from the higher +# frame rate. + +#local/chain/compare_wer.sh 6h 6i 6l +#System 6h 6i 6l +#WER on train_dev(tg) 15.46 15.62 15.42 +#WER on train_dev(fg) 14.28 14.46 14.25 +#WER on eval2000(tg) 17.4 17.3 17.3 +#WER on eval2000(fg) 15.7 15.8 15.8 +#Final train prob -0.105663 -0.10417-0.0984719 +#Final valid prob -0.130166 -0.123985 -0.119088 +#Final train prob (xent) -1.42483 -1.60566 -1.46581 +#Final valid prob (xent) -1.49792 -1.67945 -1.51644 + + +# _6i takes aspects from 5n and 6g. Like 6g it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. + +# local/chain/compare_wer.sh 6g 6i +# System 6g 6i +# WER on train_dev(tg) 15.50 15.62 +# WER on train_dev(fg) 14.31 14.46 +# WER on eval2000(tg) 17.5 17.3 +# WER on eval2000(fg) 15.8 15.8 +# Final train prob -0.105853 -0.10417 +# Final valid prob -0.129997 -0.123985 + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
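[Editor's note] Since 6l inherits the 7.5 ms "hiresf" front end of 6i, it may help to spell out the frame-rate arithmetic that the comments above and the recipe below rely on. This is purely illustrative, not part of the recipe; the numbers are copied from the script itself (7.5 ms frame shift, --frame-subsampling-factor 4, --alignment-subsampling-factor 3, frames_per_eg=200, old ivector_period of 10):

# Illustrative arithmetic only (values taken from the script):
input_shift=7.5   # ms, the mfcc_hiresf frame shift
echo "output shift:    4 x $input_shift ms = $(echo "4 * $input_shift" | bc) ms"        # 30 ms
echo "alignment shift: 3 x 10 ms  = $((3 * 10)) ms"                                     # also 30 ms
echo "eg length: 200 x $input_shift ms = $(echo "200 * $input_shift" | bc) ms"          # same 1.5 s as 150 x 10 ms
echo "exact ivector_period: 100 / $input_shift = $(echo "scale=2; 100 / $input_shift" | bc)"  # ~13.33, hence 13/14

The fact that the 30 ms output shift matches three steps of the original 10 ms alignments is what lets --frame-subsampling-factor 4 be combined with --alignment-subsampling-factor 3 against the existing lattices, and the 0.0075 written to $dir/frame_shift is simply the input shift in seconds for sclite scoring.
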
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6l # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 # 20 is equivalent to 150 at 10ms frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate faster-frame-rate (7.5 ms frame shift) version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hiresf + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hiresf.conf \ + data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hiresf exp/make_hiresf/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hiresf # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + # the ivector_period would have to be 13.333 to get the exact same rate. + # set it to 14 (slightly over) as less likely to produce errors in decoding. + echo 14 > exp/nnet3/ivectors_${dataset}_fake/ivector_period + done + # for the training set, use 13 as the ivector_period... this avoids + # errors for some longer utterances (the code checks the matching + # in a slightly different way). none of this would be necessary + # if we generated iVectors using the same frame shift. + echo 13 > exp/nnet3/ivectors_${train_set}_fake/ivector_period +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{05,b11,b12,b13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2,4 -4,0,4 -4,0,4 -8,-4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 2000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hiresf $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. 
+fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 16 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake \ + $graph_dir data/${decode_set}_hiresf $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hiresf \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6m.sh b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh new file mode 100755 index 00000000000..8a7b14ef342 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6m.sh @@ -0,0 +1,497 @@ +#!/bin/bash + +# _6m is as _6j (which subsamples by 4 frames not 3 at the output), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# this is unhelpful and if anything is a little worse. +#local/chain/compare_wer.sh 6j 6m +#System 6j 6m +#WER on train_dev(tg) 15.86 16.08 +#WER on train_dev(fg) 14.79 14.85 +#WER on eval2000(tg) 17.6 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.131444 -0.131515 +#Final valid prob -0.167574 -0.17046 +#Final train prob (xent) -1.45908 -1.43814 +#Final valid prob (xent) -1.55937 -1.5412 + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. 
+#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
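[Editor's note] Two of the 6m-specific numbers quoted near the top of this header are easy to sanity-check; the lines below are only a worked illustration of those comments, not part of the recipe:

# --left/right-tolerance: the default (-5,+10) and 6m's (-7,+8) allow the same
# total slack of 15 frames; 6m just centres the window:
echo "default width: $((5 + 10)); 6m width: $((7 + 8))"                    # both 15
# num_epochs=3: with 4 frame-shifted copies of the data this sees as many
# examples as the usual num_epochs=4 with 3 frame shifts:
echo "3 epochs x 4 shifts = $((3 * 4)); 4 epochs x 3 shifts = $((4 * 3))"  # both 12
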
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6m # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6n.sh b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh new file mode 100755 index 00000000000..625cb73cf50 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6n.sh @@ -0,0 +1,499 @@ +#!/bin/bash + +# _6n is as _6m, but with a less-wide splicing context. + +# The effect is inconsistent- there is none, on average. 
+#System 6j 6m 6n +#WER on train_dev(tg) 15.86 16.08 16.01 +#WER on train_dev(fg) 14.79 14.85 14.66 +#WER on eval2000(tg) 17.6 17.6 17.7 +#WER on eval2000(fg) 15.8 15.8 15.9 +#Final train prob -0.131444 -0.131515 -0.133681 +#Final valid prob -0.167574 -0.17046 -0.172072 +#Final train prob (xent) -1.45908 -1.43814 -1.53108 +#Final valid prob (xent) -1.55937 -1.5412 -1.65137 + +# _6m is as _6j (which subsamples by 4 frames), changing just the +# --left-tolerance and --right-tolerance to be the same total width but more +# symmetrical (-7,+8) vs the default (-5, +10). + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. +# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. 
+ +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
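+# (in the present script the configs below keep leftmost_questions_truncate=-1,
+# i.e. that truncation stays disabled, as in 2m.)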
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off.  The code is not the same
+# as the one in 2{f,g,h}.  We have only the options --num-extra-lm-states=2000.  By
+# default it estimates a 4-gram, with 3-gram as the no-prune order.  So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0.  This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training.  We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options.  Increased the frames-overlap
+# to 30 also.  This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact.  I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same.  On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6n # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
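+  # Note (roughly; the details are in utils/create_split_dir.pl): the script
+  # above should make $dir/egs/storage a set of numbered symlinks into the
+  # listed /export/b0{5,6,7,8} directories, so that when egs are actually
+  # dumped here (rather than re-used via --egs-dir, as below) the large
+  # archives get spread over several disks; the .nodelete file is an empty
+  # marker, presumably checked by whatever cleanup would otherwise remove
+  # stale egs.  To see where the archives would land:
+  #   ls -l $dir/egs/storage/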
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6m_sp/egs \ + --left-tolerance 7 --right-tolerance 8 \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -2,0,2 -2,0,2 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6o.sh b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh new file mode 100755 index 00000000000..e07e6092644 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6o.sh @@ -0,0 +1,509 @@ +#!/bin/bash + +# _6o is as _6h but halving the --l2-regularize option, because since the +# time we last tuned this, other regularization methods have been added. + +#It's worse. +#local/chain/compare_wer.sh 6h 6o +#System 6h 6o +#WER on train_dev(tg) 15.46 15.61 +#WER on train_dev(fg) 14.28 14.58 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.105663-0.0992526 +#Final valid prob -0.130166 -0.127421 +#Final train prob (xent) -1.42483 -1.4369 +#Final valid prob (xent) -1.49792 -1.49867 + +# _6h is as _6g but adding --xent-separate-forward-affine=true, which +# gives a separate last-but-one weight matrix to the xent output. + +# Although this slight improvement is probably not significant, it's a +# sensible idea so I think I'll stick with it. 
+#local/chain/compare_wer.sh 6g 6h +#System 6g 6h +#WER on train_dev(tg) 15.50 15.46 +#WER on train_dev(fg) 14.31 14.28 +#WER on eval2000(tg) 17.5 17.4 +#WER on eval2000(fg) 15.8 15.7 +#Final train prob -0.105853 -0.105663 +#Final valid prob -0.129997 -0.130166 + +# _6g is as _6f but increasing the parameters (increasing +# jesus-forward-input-from from 500 to 600). + +# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change +# means there is no hidden part in the jesus layer (it's just repeated affine and relu). + +# slightly worse, but encouragingly small difference. +#local/chain/compare_wer.sh 5v 6f +#System 5v 6f +#WER on train_dev(tg) 15.38 15.71 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.111305 +#Final valid prob -0.131797 -0.131487 + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6o # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
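+  # Note: the call below re-uses egs already dumped for the 2y system
+  # (--egs-dir exp/chain/tdnn_2y_sp/egs), so nothing is written to the storage
+  # directories set up above.  This presumably only works because the
+  # splice-indexes keep the model's total left/right context within what those
+  # egs were dumped with; assuming the usual get_egs.sh layout, that context
+  # can be checked with e.g.:
+  #   cat exp/chain/tdnn_2y_sp/egs/info/{left_context,right_context,frames_per_eg}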
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.000025 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6p.sh b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh new file mode 100755 index 00000000000..a9f7eef9bbc --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6p.sh @@ -0,0 +1,503 @@ +#!/bin/bash + +# _6p is as _6j, but increasing the various regularization coefficients. +# the intention is to increase them by 4/3, since they are all evaluated +# once per output frame, and there are now fewer output frames by a factor +# of 3/4. To make them rounder numbers, I increased some by a factor +# of 5/4 (--xent-regularize, 0.1 -> 0.125, and --leaky-hmm-coefficient, +# 0.1 -> 0.125), and l2-regularize by 3/2 (0.00005 -> 0.000075). + +# Worse. +#local/chain/compare_wer.sh 6j 6p +#System 6j 6p +#WER on train_dev(tg) 15.86 15.91 +#WER on train_dev(fg) 14.79 14.76 +#WER on eval2000(tg) 17.6 17.9 +#WER on eval2000(fg) 15.8 15.9 +#Final train prob -0.131444 -0.143285 +#Final valid prob -0.167574 -0.173759 +#Final train prob (xent) -1.45908 -1.44287 +#Final valid prob (xent) -1.55937 -1.52918 + + +# _6j is another baseline for _6i, in which we use regular features (10 ms frame +# shift) with the 4-fold subsampling of 6i. I don't expect this will be as +# good, but it will be nice to have confirmation that the lower sampling +# rate is actually helpful. 
+# reducing frames-per-eg from 200 to 150 and --frames-per-iter from +# 2 million to 1.5 million. + +# Hm- the difference is surprisingly small, about 0.2% worse on average. +#local/chain/compare_wer.sh 6i 6j +#System 6i 6j +#WER on train_dev(tg) 15.62 15.86 +#WER on train_dev(fg) 14.46 14.79 +#WER on eval2000(tg) 17.3 17.6 +#WER on eval2000(fg) 15.8 15.8 +#Final train prob -0.10417 -0.131444 +#Final valid prob -0.123985 -0.167574 +#Final train prob (xent) -1.60566 -1.45908 +#Final valid prob (xent) -1.67945 -1.55937 + +# _6i takes aspects from 5n and 6h. Like 6h it uses a 'thin' jesus-layer +# (no hidden dimension), and like 5n it uses a non-standard frame shift at the +# input, but this frame shift is 7.5 ms rather than 5ms (5n) or 10ms (6h). +# the idea is that this allows us to subsample the input frames by a factor +# of 4, rather than 3, and since 4 = 2 * 2, we can do the subsampling +# in two stages. You'll see this reflected in the splice indexes. +# Some notes: +# - we had the choose the splice indexes; we have 1 hidden layer at +# base frame rate, 2 at + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly. +# note: the frames-per-iter should be 1.6 million to get the same amount of +# data per iteration, but I'm making it 2 million as the training per is getting +# faster than I like (-> wasting time waiting for the queue). + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. 
Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6p # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=3 # this is about the same amount of compute as the normal 4, since + # epoch encompasses all frame-shifts of the data and we now have 4 + # frames-shifts rather than 3. (3 * 4 == 4 * 3). +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 4 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
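+  # (Note: --egs-dir below points at exp/chain/tdnn_6j_sp/egs, i.e. the examples
+  # dumped by the 6j run are presumably reused here rather than re-generated
+  # under $dir/egs.)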
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6j_sp/egs \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 4 \ + --xent-regularize 0.125 \ + --leaky-hmm-coefficient 0.125 \ + --l2-regularize 0.000075 \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,-1,0,1,2 -4,-2,0,2 -4,0,4 -4,0,4 -4,0,4" \ + --apply-deriv-weights false \ + --frames-per-iter 1500000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.0075" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6q.sh b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh new file mode 100755 index 00000000000..440da3a1d6b --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6q.sh @@ -0,0 +1,493 @@ +#!/bin/bash + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. +# (note, I forgot the self-repair-scale, and I probably should have used +# 6h as the baseline because it has --xent-separate-forward-affine=true; +# note, this experiment doesn't have --xent-separate-forward-affine=true but +# it would have been better to have it (retrying as 6r) + +# we're about 0.2% better than 6g. 
+#local/chain/compare_wer.sh 6g 6q +#System 6g 6q +#WER on train_dev(tg) 15.50 15.25 +#WER on train_dev(fg) 14.31 14.24 +#WER on eval2000(tg) 17.5 17.2 +#WER on eval2000(fg) 15.8 15.6 +#Final train prob -0.105853 -0.106936 +#Final valid prob -0.129997 -0.123066 +#Final train prob (xent) -1.4718 -1.66328 +#Final valid prob (xent) -1.55129 -1.71979 + + + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=13 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6q # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
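+  # (The alignments in $ali_dir are at the original 10 ms frame rate, so a
+  # frame-subsampling factor of 3 here corresponds to the 30 ms output rate of
+  # the chain model; the 5 ms double-frame-rate features used for training
+  # below reach the same 30 ms rate via --frame-subsampling-factor 6.)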
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6r.sh b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh new file mode 100755 index 00000000000..ffbac19d1eb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6r.sh @@ -0,0 +1,492 @@ +#!/bin/bash + +# _6r is as _6q, but adding --self-repair-scale 0.00001 +# --xent-separate-forward-affine=true. the appropriate normal-frame-rate +# baseline for this is 6h (since it has --xent-separate-forward-affine=true), +# so using that as the baseline: + +#local/chain/compare_wer.sh 6h 6r +#System 6h 6r +#WER on train_dev(tg) 15.46 15.06 +#WER on train_dev(fg) 14.28 14.05 +#WER on eval2000(tg) 17.4 17.2 +#WER on eval2000(fg) 15.7 15.4 +#Final train prob -0.105663 -0.106685 +#Final valid prob -0.130166 -0.122293 +#Final train prob (xent) -1.42483 -1.62108 +#Final valid prob (xent) -1.49792 -1.67695 + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). 
+#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+ +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6r # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
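+    # (The iVectors were extracted once per 10 frames of 10 ms features, i.e.
+    # every 100 ms; with the 5 ms frames of the _hires_dbl data the same 100 ms
+    # interval spans 20 frames, hence the ivector_period of 20 written below.)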
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6q_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6s.sh b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh new file mode 100755 index 00000000000..4693dde0a31 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6s.sh @@ -0,0 +1,502 @@ +#!/bin/bash + + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only difference is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. 
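+# (Concretely: a splice offset of -3 on 10 ms frames looks 30 ms into the
+# past, and the doubled offset of -6 on these 5 ms frames covers the same
+# 30 ms, so the per-layer time context is unchanged relative to 6h.)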
+ +# So we lose the improvement that we got in 6r (see below). This is consistent +# with the idea that we really do need the higher-frame-rate input, but it's +# also possible that some slight differences in the splicing indexes were +# responsible, so in 6t we'll do an experiment where we try to get closer +# to the splicing setup of 6r. +# +# local/chain/compare_wer.sh 6h 6r 6s +#System 6h 6r 6s +#WER on train_dev(tg) 15.46 15.06 15.50 +#WER on train_dev(fg) 14.28 14.05 14.45 +#WER on eval2000(tg) 17.4 17.2 17.5 +#WER on eval2000(fg) 15.7 15.4 15.7 +#Final train prob -0.105663 -0.106685 -0.105965 +#Final valid prob -0.130166 -0.122293 -0.122376 +#Final train prob (xent) -1.42483 -1.62108 -1.5454 +#Final valid prob (xent) -1.49792 -1.67695 -1.58129 + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. 
+ +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6s # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
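+    # (Rough reasoning behind the _fake2 directory, for reference: the original
+    # 10 ms-shift features store one iVector every 10 frames, i.e. one per
+    # 100 ms; with the 5 ms-shift "dbl" features the same 100 ms spans 20
+    # frames, so the iVectors themselves can be reused unchanged and only the
+    # declared period changes from 10 to 20.)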
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-2,0,2 -2,0,2,4 -6,0,6 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6t.sh b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh new file mode 100755 index 00000000000..47921335155 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6t.sh @@ -0,0 +1,512 @@ +#!/bin/bash + +# since _6s didn't work that well, in 6t we try something else: +# modifying 6s to use almost exactly the same splicing indexes as 6r, +# but with the first splice indexes changed from -1,0,1 to -1,1, so that +# all the differences are multiples of 2 (so the effective frame rate is +# the normal frame rate). In effect we're using a narrower splicing +# at the start of the nnet, than 6s. + +# 6t does seem better than 6s, but not quite as good as 6r. 
+# the fact that it's not as good as 6r may show that the double-frame-rate +# input was actually giving us some useful information-- although the +# improvement is only something like 0.1%-0.2%, and we didn't actually see +# any difference in the objective function from 6r, which undermines the +# notion that by removing that central 0 splice at the input, we lost +# some information. +# +# +#local/chain/compare_wer.sh 6r 6s 6t +#System 6r 6s 6t +#WER on train_dev(tg) 15.06 15.50 15.34 +#WER on train_dev(fg) 14.05 14.45 14.23 +#WER on eval2000(tg) 17.2 17.5 17.2 +#WER on eval2000(fg) 15.4 15.7 15.6 +#Final train prob -0.106685 -0.105965 -0.106575 +#Final valid prob -0.122293 -0.122376 -0.121902 +#Final train prob (xent) -1.62108 -1.5454 -1.62226 +#Final valid prob (xent) -1.67695 -1.58129 -1.67252 + +# _6s is as _6r, but changing the splicing indexes to be exactly the same as 6h, +# but all multiplied by 2. This means that for any given frame-shift, the network +# sees exactly the same input as 6h; the only differences is that we see +# more frame shifts, i.e. the data is more carefully perturbed than 6h. +# this is to help disentangle whether the improvement really comes from the +# higher-resolution features, or from the improved data shifting. + +# _6r is as _6q, but adding --self-repair-scale 0.00001 --xent-separate-forward-affine=true + +# _6q is as _5n (which is a double-frame-rate system), but putting back +# the iVectors and otherwise changing the configuration as in 5j -> 6g, +# like 'rebasing' the changes onto 6g. + +# _5n is as _5j (also omitting the iVectors), but using double the input frame +# rate from 10 to 5 ms (and reducing frame width from 25 to 20), and modifying +# the splice indexes accordingly + +# A very nice improvement on dev; small improvement on eval2000 though. +#local/chain/compare_wer.sh 5j 5n +#System 5j 5n +#WER on train_dev(tg) 17.59 16.85 +#WER on train_dev(fg) 16.33 15.67 +#WER on eval2000(tg) 19.1 19.1 +#WER on eval2000(fg) 17.5 17.3 +#Final train prob -0.114691 -0.116341 +#Final valid prob -0.130761 -0.130884 + +# _5j is as _5e, but omitting the iVectors. + +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. 
Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. 
Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] 
+#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. 
+
+# _2k is as _2i, but doing the same change as in _s -> _2e, in which we
+# set --apply-deriv-weights false and --frames-overlap-per-eg 0.
+
+# _2i is as _2d but with a new set of code for estimating the LM, in which we compute
+# the log-like change when deciding which states to back off. The code is not the same
+# as the one in 2{f,g,h}. We use only the option --num-extra-lm-states=2000. By
+# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration
+# is quite similar to 2d, except new/more-exact code is used.
+
+# _2d is as _2c but with different LM options:
+# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000"
+# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram.
+# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions
+# provided from the tree-building, and effectively puts the leftmost context position as a single
+# set.
+# This seems definitely helpful: on train_dev, with tg the improvement is 18.12->17.55 and with fg
+# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6.
+
+# _2c is as _2a but after a code change in which we start using transition-scale
+# and self-loop-scale of 1 instead of zero in training; we change the options to
+# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect
+# results at all; it is mainly for convenience in pushing weights in graphs,
+# and checking that graphs are stochastic.
+
+# _2a is as _z but setting --lm-opts "--num-extra-states=8000".
+
+# _z is as _x but setting --lm-opts "--num-extra-states=2000".
+# (see also y, which has --num-extra-states=500).
+
+# _x is as _s but setting --lm-opts "--num-extra-states=0".
+# this is a kind of repeat of the u->v experiment, where it seemed to make things
+# worse, but there were other factors involved in that so I want to be sure.
+
+# _s is as _q but setting pdf-boundary-penalty to 0.0.
+# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000,
+# and 18.07 -> 16.96 on train_dev, after fg rescoring.
+
+# _q is as _p except making the same change as from n->o, which
+# reduces the parameters to try to reduce over-training. We reduce
+# relu-dim from 1024 to 850, and target num-states from 12k to 9k,
+# and modify the splicing setup.
+# note: I don't rerun the tree-building, I just use the '5o' treedir.
+
+# _p is as _m except with a code change in which we switch to a different, more
+# exact mechanism to deal with the edges of the egs, and correspondingly
+# different script options... we now dump weights with the egs, and apply the
+# weights to the derivative w.r.t. the output instead of using the
+# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap
+# to 30 also. This will give 10 frames on each side with zero derivs, then
+# ramping up to a weight of 1.0 over 10 frames.
+
+# _m is as _k but after a code change that makes the denominator FST more
+# compact. I am rerunning in order to verify that the WER is not changed (since
+# it's possible in principle that due to edge effects related to weight-pushing,
+# the results could be a bit different).
+# The results are inconsistently different but broadly the same. On all of eval2000,
+# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring.
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6t # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=2 # this is about the same amount of compute as the normal 4, since one + # epoch encompasses all frame-shifts of the data. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=300 # doubling it, since we have half the frame rate. +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. 
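+    # (Same reasoning as in the 6s script: one iVector per 10 frames at a
+    # 10 ms shift is one per 100 ms; at the 5 ms "dbl" shift that 100 ms is
+    # 20 frames, so we reuse the iVectors and only bump the period to 20.)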
+ [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_6s_sp/egs \ + --frame-subsampling-factor 6 \ + --alignment-subsampling-factor 3 \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1800 --jesus-hidden-dim 7500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0" \ + --apply-deriv-weights false \ + --frames-per-iter 3000000 \ + --online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires_dbl $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6u.sh b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh new file mode 100755 index 00000000000..4c48a75ffd6 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6u.sh @@ -0,0 +1,524 @@ +#!/bin/bash + +# _6u is as _6h, but with slightly different splicing indexes (start +# narrower than 6h and ramp up slowly). These are designed to be +# equivalent to those in 6t, except for use with normal-frame-rate, +# not double-frame-rate, input. The difference between 6t and 6u +# will show us whether having double-frame-rate input for the purpose +# of getting more different shifted versions of the input, is helpful. 
+
+# [however, note that the number of frames-per-iter is not comparable
+# between 6t and 6u: here we're using 1.2 million frames per iter,
+# and 6s is using 3 million which at the normal frame rate would be
+# 1.5 million, and 1.2 != 1.5.]
+
+# 6u is no better than 6h, and maybe slightly worse. Certainly it's worse than
+# 6t. In addition, the train-valid difference is bigger with 6h and 6u than
+# with 6t. This is all consistent with the notion that the higher-frame-rate
+# input, with which we can generate more shifted versions, does really make a
+# difference. However, I want to wait till the 6v->6w comparison is ready,
+# which may let us know whether the difference in frames-per-iter could have
+# been a confounding factor here. (It's unlikely, but possible).
+#
+#local/chain/compare_wer.sh 6h 6t 6u
+#System 6h 6t 6u
+#WER on train_dev(tg) 15.46 15.34 15.46
+#WER on train_dev(fg) 14.28 14.23 14.28
+#WER on eval2000(tg) 17.4 17.2 17.6
+#WER on eval2000(fg) 15.7 15.6 15.9
+#Final train prob -0.105663 -0.106575 -0.108665
+#Final valid prob -0.130166 -0.121902 -0.129495
+#Final train prob (xent) -1.42483 -1.62226 -1.54189
+#Final valid prob (xent) -1.49792 -1.67252 -1.60749
+
+# _6h is as _6g but adding --xent-separate-forward-affine=true, which
+# gives a separate last-but-one weight matrix to the xent output.
+
+# Although this slight improvement is probably not significant, it's a
+# sensible idea so I think I'll stick with it.
+#local/chain/compare_wer.sh 6g 6h
+#System 6g 6h
+#WER on train_dev(tg) 15.50 15.46
+#WER on train_dev(fg) 14.31 14.28
+#WER on eval2000(tg) 17.5 17.4
+#WER on eval2000(fg) 15.8 15.7
+#Final train prob -0.105853 -0.105663
+#Final valid prob -0.129997 -0.130166
+
+# _6g is as _6f but increasing the parameters (increasing
+# jesus-forward-input-dim from 500 to 600).
+
+# _6f is as _5v, but setting --jesus-hidden-dim to 0 which with a script change
+# means there is no hidden part in the jesus layer (it's just repeated affine and relu).
+
+# slightly worse, but encouragingly small difference.
+#local/chain/compare_wer.sh 5v 6f
+#System 5v 6f
+#WER on train_dev(tg) 15.38 15.71
+#WER on train_dev(fg) 14.39 14.50
+#WER on eval2000(tg) 17.4 17.5
+#WER on eval2000(fg) 15.7 15.9
+#Final train prob -0.11156 -0.111305
+#Final valid prob -0.131797 -0.131487
+
+# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500.
+
+# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse.
+#
+#local/chain/compare_wer.sh 5e 5s 5t 5v
+#System 5e 5s 5t 5v
+#WER on train_dev(tg) 15.43 15.47 15.43 15.38
+#WER on train_dev(fg) 14.32 14.31 14.34 14.39
+#WER on eval2000(tg) 17.3 17.4 17.4 17.4
+#WER on eval2000(fg) 15.5 15.6 15.6 15.7
+#Final train prob -0.110056 -0.110928 -0.110752 -0.11156
+#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797
+
+# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it
+# up), from 5000 to 3500.
+
+# about 5s: comparing with 5e which is the most recent baseline we actually
+# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700,
+# jesus-hidden-dim reduced 7500 to 5000, and the new option
+# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even
+# smaller jesus-hidden-dims.
+
+# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate
+# value of 1700 (between 1500 and 1800), and also fixing a bug in the self-repair
+# code which was doubling the thresholds so there was, in effect,
+# no upper threshold. 
I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. +# quite helpful: +#local/chain/compare_wer.sh 4w 5a +#System 4w 5a +#WER on train_dev(tg) 16.05 15.86 +#WER on train_dev(fg) 14.92 14.74 +#WER on eval2000(tg) 18.0 17.4 +#WER on eval2000(fg) 16.2 15.6 +#Final train prob -0.108816-0.0998359 +#Final valid prob -0.118254 -0.115884 + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6u # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1700 --jesus-hidden-dim 0 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001 --xent-separate-forward-affine=true" \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6v.sh b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh new file mode 100755 index 00000000000..df711d31aa1 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6v.sh @@ -0,0 +1,283 @@ +#!/bin/bash +# This script contains online decoding using chain + nnet3 setup. +# _6v is as _6h, but moving to a TDNN+ReLU recipe instead of using jesus-layer. +# Otherwise we make everything as similar as possible to 6h. +# The ReLU dimension, at 576, is chosen to make the number of parameters about +# the same as 6h. + +# great improvement! +# local/chain/compare_wer.sh 6h 6v +# System 6h 6v +# WER on train_dev(tg) 15.46 15.00 +# WER on train_dev(fg) 14.28 13.91 +# WER on eval2000(tg) 17.4 17.2 +# WER on eval2000(fg) 15.7 15.7 + +# the following objf values are computed on the last iter (323), because due to +# a script bug, now fixed, the 'final' ones were not computed in 6v. +# note: in this run the xent learning rate was too slow. 
+# 323 train prob -0.129285 -0.120026 +# 323 valid prob -0.151648 -0.140628 +# 323 train prob (xent) -1.4443 -1.5431 +# 323 valid prob (xent) -1.51731 -1.56975 + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6v # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
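  # (A minimal sketch, not part of the original recipe: this run reuses the egs
  #  dumped for the 2y experiment via the --egs.dir option of the train.py call
  #  below, so it can be useful to fail early if that directory is missing.
  #  The path is the one hard-coded in the training call.)
  if [ ! -d exp/chain/tdnn_2y_sp/egs ]; then
    echo "$0: expected existing egs in exp/chain/tdnn_2y_sp/egs (dump them with the 2y recipe first)"
    exit 1;
  fi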
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# Results using offline and online decoding +# System 6v_sp 6v_sp_online 6v_sp_online{per_utt} +# WER on train_dev(tg) 14.68 14.72 15.43 +# WER on train_dev(fg) 13.49 13.58 14.18 +# WER on eval2000(tg) 17.2 17.3 18.2 +# WER on eval2000(fg) 15.7 15.9 16.7 + +#if [ $stage -le 15 ]; then +# # If this setup used PLP features, we'd have to give the option --feature-type plp +# # to the script below. +# steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ +# data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +#fi + + + +#if [ $stage -le 16 ]; then +# iter_opts= +# if [ ! -z $decode_iter ]; then +# iter_opts=" --iter $decode_iter " +# fi +# for decode_set in train_dev eval2000; do +# ( +# steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --nj 50 --cmd "$decode_cmd" $iter_opts --config conf/decode_online.config \ +# $graph_dir data/${decode_set}_hires ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; +# fi +# ) & +# done +#fi +# +#if [ $stage -le 17 ]; then +# iter_opts= +# if [ ! 
-z $decode_iter ]; then +# iter_opts=" --iter $decode_iter " +# fi +# for decode_set in train_dev eval2000; do +# ( +# steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --config conf/decode_online.config \ +# --nj 50 --cmd "$decode_cmd" $iter_opts --per-utt true \ +# $graph_dir data/${decode_set}_hires ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff}_per_utt || exit 1; +# if $has_fisher; then +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ +# data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ +# ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_per_utt || exit 1; +# fi +# ) & +# done +#fi +# +wait; + +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6w.sh b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh new file mode 100755 index 00000000000..3e3bb622290 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6w.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# I discovered after running this that there was a problem with the egs-dumping, +# which seems to have existed for quite a while: the --right-tolerance defaults to 10 +# in the script, but it should have been 5, to match the code. However, 6v was +# run with older egs (before this bug was introduced) from 2y, so it doesn't +# have the problem. + +# note regarding the changes in objfs: these have explanations, they are due to +# the --right-tolerance increasing from 5->10 in 6v->6w: the chain objfs improve +# because of the less-restrictive numerator graphs, and the xent objfs get worse +# because the phone alignments become less consistent; we can see the reverse +# pattern in 6y -> 6z when we revert the right-tolerance back to 5. +# +#local/chain/compare_wer.sh 6v 6w +#System 6v 6w +#WER on train_dev(tg) 15.00 15.33 +#WER on train_dev(fg) 13.91 14.27 +#WER on eval2000(tg) 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 +#Final train prob -0.105012 -0.10287 +#Final valid prob -0.125877 -0.120451 +#Final train prob (xent) -1.54736 -1.63586 +#Final valid prob (xent) -1.57475 -1.67173 + + + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6w # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6x.sh b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh new file mode 100755 index 00000000000..177ddd2a867 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6x.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# 6x is as 6w, but changing the splice-indexes to be like in 6u +# except since this is a TDNN setup, we need a final "0" [the jesus-layer +# setup had a final ReLU as a special case.]. +# These splice indexes start smaller, and ramp up more slowly, than +# the baseline in 6w. +# We're reusing the 6x egs. + +# no clear benefit; if anything, it's slightly worse. +# local/chain/compare_wer.sh 6w 6x +# System 6w 6x +# WER on train_dev(tg) 15.33 15.30 +# WER on train_dev(fg) 14.27 14.35 +# WER on eval2000(tg) 17.3 17.4 +# WER on eval2000(fg) 15.6 15.7 +# Final train prob -0.10287 -0.103078 +# Final valid prob -0.120451 -0.122477 +# Final train prob (xent) -1.63586 -1.73292 +# Final valid prob (xent) -1.67173 -1.75042 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6x # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0 -1,0,1 -2,-1,0,1 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6y.sh b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh new file mode 100755 index 00000000000..a15c6648641 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6y.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# WER results are inconclusive, but objective values are encouraging. +# We'll keep the change as it makes sense. +# local/chain/compare_wer.sh 6w 6y +# System 6w 6y +# WER on train_dev(tg) 15.33 15.36 +# WER on train_dev(fg) 14.27 14.19 +# WER on eval2000(tg) 17.3 17.2 +# WER on eval2000(fg) 15.6 15.8 +# Final train prob -0.10287 -0.102139 +# Final valid prob -0.120451 -0.119654 +# Final train prob (xent) -1.63586 -1.55598 +# Final valid prob (xent) -1.67173 -1.58821 + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6y # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --egs.dir exp/chain/tdnn_6w_sp/egs \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh new file mode 100755 index 00000000000..97cc1b83624 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? +# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh new file mode 100755 index 00000000000..e1c8d263458 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_l2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
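  # (Editorial note, not in the original script: this is the first of a set of
  #  ablations of 6z -- _l2, _l2_leaky, _leaky and _none -- which keep different
  #  subsets of the --chain.l2-regularize and --chain.leaky-hmm-coefficient
  #  options and all drop the xent regularization. They all reuse the egs from
  #  exp/chain/tdnn_6z_sp via "$common_egs_dir", so the comparison isolates the
  #  effect of the regularization terms.)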
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh new file mode 100755 index 00000000000..157ecb2d6f7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_l2_leaky.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_l2_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh new file mode 100755 index 00000000000..3ac915142d3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_leaky.sh @@ -0,0 +1,201 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh new file mode 100755 index 00000000000..e9aa68c2dd7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_none.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +# same as 6z but with only l2 regularization +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_none # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. 
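# (A minimal sketch, not part of the original script: every variable in the
#  configuration section above can be overridden from the command line, since
#  utils/parse_options.sh, sourced a few lines below, maps options such as
#  --num-epochs onto the shell variables of the same name. For example, assuming
#  the 6z egs in $common_egs_dir have already been dumped, one could rerun just
#  the training and decoding stages with:)
# local/chain/run_tdnn_6z_none.sh --stage 13 --num-epochs 4 --remove-egs false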
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh new file mode 100755 index 00000000000..10d4ce3ce73 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +# same as 6z but with no frame-subsampling +# This is a simplification of the _6z script as we use the normal lang/ directory, +# we set frame-subsampling-factor and alignment-subsampling-factor to 1. +# it is at least 3 times slower than _6z, +# We increase the num-epochs, possibly by a factor of 3 [since there would only be one shift]. + + +set -e + +# configs for 'chain' +affix= +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_nosubsamp # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=12 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=32 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 1 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 1 \ + --chain.alignment-subsampling-factor 1 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.0333 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh new file mode 100755 index 00000000000..6acba86b3af --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_nosubsampling4.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# same as 6z but with no frame-subsampling +# This is a simplification of the _6z script as we use the normal lang/ directory, +# we set frame-subsampling-factor and alignment-subsampling-factor to 1. +# it is at least 3 times slower than _6z, +# We increase the num-epochs, possibly by a factor of 3 [since there would only be one shift]. + + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_nosubsamp_100cw_lowreg # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=12 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.414 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=576 +frames_per_eg=100 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_nosubsamp_100cw_sp/egs +xent_regularize=0.0333 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --egs.dir "$common_egs_dir" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh new file mode 100755 index 00000000000..4c91a49ca90 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_l2.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_xent_l2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 +common_egs_dir=exp/chain/tdnn_6z_sp/egs + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh new file mode 100755 index 00000000000..d9696a91795 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6z_xent_leaky.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6z_xent_leaky # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_6z_sp/egs +xent_regularize=0.1 + + + +# End configuration section. 
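# The variables in the configuration section above are only defaults:
# utils/parse_options.sh (sourced just below) rewrites any of them from a
# matching command-line flag, with underscores in the variable name mapped to
# dashes in the flag.  A hypothetical invocation, shown purely as a sketch
# (the values are examples, not tuned settings), would be:
#
#   local/chain/run_tdnn_6z_xent_leaky.sh --num-epochs 6 \
#     --dir exp/chain/tdnn_6z_xent_leaky_6epoch --train-stage -10
#
# so experiments can be varied without editing the script itself.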
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7a.sh b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh new file mode 100755 index 00000000000..95c3c9f4c24 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7a.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# 7a inherits from 6z (which is a TDNN+ReLU-based network with various small +# bugs hopefully fixed now), and from 6r, which is our most-successful +# double-frame-rate system. We're re-dumping the egs, because the egs used in +# 6r used right-tolerance=10, which turns out to have been a bug, and not a +# helpful one. + +# it is not better than 6z. +# local/chain/compare_wer.sh 6v 6z 7a +#System 6v 6z 7a +#WER on train_dev(tg) 15.00 15.18 15.05 +#WER on train_dev(fg) 13.91 14.06 14.10 +#WER on eval2000(tg) 17.2 17.2 17.3 +#WER on eval2000(fg) 15.7 15.6 15.7 +#Final train prob -0.105012 -0.106268 -0.110288 +#Final valid prob -0.125877 -0.126726 -0.127071 +#Final train prob (xent) -1.54736 -1.4556 -1.59569 +#Final valid prob (xent) -1.57475 -1.50136 -1.62312 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=14 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=2 # use 2 not 4 epochs, as with the double-frame-rate input, we + # shift the input data in double the number of distinct ways + # on each epoch. +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +# Generate double-frame-rate version of the data. +if [ $stage -le 12 ]; then + mfccdir=mfcc + for dataset in eval2000 train_dev; do ## ${train_set}; do + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires_dbl + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires_dbl.conf \ + data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires_dbl exp/make_hires_dbl/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires_dbl # remove segments with problems + done +fi + +if [ $stage -le 13 ]; then + for dataset in eval2000 train_dev ${train_set}; do + mkdir -p exp/nnet3/ivectors_${dataset}_fake2 + cp exp/nnet3/ivectors_${dataset}/ivector_online.scp exp/nnet3/ivectors_${dataset}_fake2 + # verify that the old ivector_period was 10. + [ $(cat exp/nnet3/ivectors_${dataset}/ivector_period) -eq 10 ] || exit 1 + echo 20 > exp/nnet3/ivectors_${dataset}_fake2/ivector_period + done +fi + +if [ $stage -le 14 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires_dbl \ + --ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -2,0,2 -4,-2,0,2 -6,0,6 -6,0,6 -12,-6,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{7,11,12,13}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
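# Frame-rate bookkeeping for the double-frame-rate run below (a sketch of the
# arithmetic, not extra configuration): the _dbl features arrive at 200
# frames/sec (frame shift 0.005 s, as written to $dir/frame_shift later),
# while the lattices in exp/tri4_lats_nodup$suffix are still at the normal
# 100 frames/sec.  The chain output stays at one frame per 30 ms, so
#   frame-subsampling-factor     = 6   # 200 Hz features  -> 33.3 Hz output
#   alignment-subsampling-factor = 3   # 100 Hz lattices  -> 33.3 Hz output
# and the ivector_period of 20 set up in the _fake2 directories keeps one
# ivector per 0.1 s of audio, matching the original period of 10 at 100 Hz.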
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set}_fake2 \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.frame-subsampling-factor 6 \ + --chain.alignment-subsampling-factor 3 \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 300 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 3000000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_dbl \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + + echo "0.005" > $dir/frame_shift # this lets the sclite decoding script know + # what the frame shift was, in seconds. +fi + +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 17 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set}_fake2 \ + $graph_dir data/${decode_set}_hires_dbl $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires_dbl \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7b.sh b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh new file mode 100755 index 00000000000..4d138cc5da0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7b.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# 7b is as 6z, but increasing the relu-dim slightly from 576 to 625. + +# there is very little change. looks like we were close to the optimum. +# local/chain/compare_wer.sh 6z 7b +# System 6z 7b +# WER on train_dev(tg) 15.18 15.15 +# WER on train_dev(fg) 14.06 14.19 +# WER on eval2000(tg) 17.2 17.2 +# WER on eval2000(fg) 15.6 15.5 +# Final train prob -0.106268 -0.102617 +# Final valid prob -0.126726 -0.126529 +# Final train prob (xent) -1.4556 -1.43802 +# Final valid prob (xent) -1.50136 -1.4964 + +# 6z is as 6y, but fixing the right-tolerance in the scripts to default to 5 (as +# the default is in the code), rather than the previous script default value of +# 10 which I seem to have added to the script around Feb 9th. +# definitely better than 6y- not clear if we have managed to get the same +# results as 6v (could indicate that the larger frames-per-iter is not helpful? 
+# but I'd rather not decrease it as it would hurt speed). + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir exp/chain/tdnn_6z_sp/egs \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7c.sh b/egs/swbd/s5c/local/chain/run_tdnn_7c.sh new file mode 100755 index 00000000000..05cb2148ba0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7c.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# 7c is as 6z, but reducing the left and right tolerance from 5 to 4. + +# No clear difference. + +# I reran the scoring of train_dev for 6z because the scoring script +# has had a bug fixed. 
+# local/score.sh data/train_dev exp/chain/tdnn_6z_sp/graph_sw1_tg exp/chain/tdnn_6z_sp/decode_train_dev_sw1_tg +# local/score.sh data/train_dev exp/chain/tdnn_6z_sp/graph_sw1_tg exp/chain/tdnn_6z_sp/decode_train_dev_sw1_fsh_fg; +# local/chain/compare_wer.sh 6z 7c +# System 6z 7c +# WER on train_dev(tg) 14.88 14.89 +# WER on train_dev(fg) 13.66 13.69 +# WER on eval2000(tg) 17.2 17.2 +# WER on eval2000(fg) 15.6 15.5 +# Final train prob -0.106268 -0.107003 +# Final valid prob -0.126726 -0.133782 +# Final train prob (xent) -1.4556 -1.40549 +# Final valid prob (xent) -1.50136 -1.47833 + + + +# local/chain/compare_wer.sh 6v 6y 6z +# System 6v 6y 6z +# WER on train_dev(tg) 15.00 15.36 15.18 +# WER on train_dev(fg) 13.91 14.19 14.06 +# WER on eval2000(tg) 17.2 17.2 17.2 +# WER on eval2000(fg) 15.7 15.8 15.6 +# Final train prob -0.105012 -0.102139 -0.106268 +# Final valid prob -0.125877 -0.119654 -0.126726 +# Final train prob (xent) -1.54736 -1.55598 -1.4556 +# Final valid prob (xent) -1.57475 -1.58821 -1.50136 + +# 6y is as 6w, but after fixing the config-generation script to use +# a higher learning-rate factor for the final xent layer (it was otherwise +# training too slowly). + +# 6w is as 6v (a new tdnn-based recipe), but using 1.5 million not 1.2 million +# frames per iter (and of course re-dumping the egs). + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7c # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.left-tolerance 4 --chain.right-tolerance 4 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_7d.sh b/egs/swbd/s5c/local/chain/run_tdnn_7d.sh new file mode 100644 index 00000000000..d33755602bd --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_7d.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# 7d is as 7b, but changing the HMM context from triphone to left biphone. + +# Left biphone model turns out to be as good as triphone model. +# local/chain/compare_wer.sh 7b 7d +# System 7b 7d +# WER on train_dev(tg) 15.10 15.03 +# WER on train_dev(fg) 14.21 14.22 +# WER on eval2000(tg) 17.2 17.4 +# WER on eval2000(fg) 15.9 15.9 +# Final train prob -0.100551 -0.092629 +# Final valid prob -0.123914 -0.11354 +# Final train prob (xent) -1.43215 -1.27932 +# Final valid prob (xent) -1.46662 -1.33193 +# Real-time factor 0.918978 0.711695 + +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_7d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=625 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_a.sh b/egs/swbd/s5c/local/chain/run_tdnn_a.sh new file mode 100755 index 00000000000..d77cb4a518a --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_a.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +# caution: the egs for this were dumped with a bug in the numerator lattices, +# you can subtract 0.0152 from the likelihoods to correct for this. (compare +# exp/chain/tdnn_a_sp/log/compute_prob_valid.final.log.new and +# exp/chain/tdnn_a_sp/log/compute_prob_valid.final.log for an explanation). + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_a # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 5000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the CTC training more freedom). + # use the same num-jobs as the alignments + nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \ + data/lang exp/tri4 exp/tri4_lats_nodup$suffix + rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
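# What the create_split_dir.pl call above does (illustrative note only; the
# CLSP-specific paths are the ones already listed): $dir/egs/storage is
# populated with numbered symlinks, one per /export/b0{1,2,3,4} disk, and the
# egs-dumping code round-robins its archives across them so no single disk
# has to hold all of the examples.  On another cluster you would substitute
# your own scratch disks, e.g. (hypothetical paths):
#
#   utils/create_split_dir.pl /scratch{1,2}/$USER/swbd-egs/storage \
#     $dir/egs/storage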
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_a2.sh b/egs/swbd/s5c/local/chain/run_tdnn_a2.sh new file mode 100755 index 00000000000..0289505f593 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_a2.sh @@ -0,0 +1,146 @@ +#!/bin/bash + + + +set -e + +# configs for 'chain' +stage=9 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_a2 # Note: _sp will get added to this if $speed_perturb == true. + + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 # match the way the code was when we first ran this +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 5000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the CTC training more freedom). + # use the same num-jobs as the alignments + nj=$(cat exp/tri4_ali_nodup$suffix/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \ + data/lang exp/tri4 exp/tri4_lats_nodup$suffix + rm exp/tri4_lats_nodup$suffix/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_b.sh b/egs/swbd/s5c/local/chain/run_tdnn_b.sh new file mode 100755 index 00000000000..3929527171c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_b.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_b # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_c.sh b/egs/swbd/s5c/local/chain/run_tdnn_c.sh new file mode 100755 index 00000000000..e7f7c756a08 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_c.sh @@ -0,0 +1,157 @@ +#!/bin/bash + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. +# also setting max-param-change=1, which it seems is what the 'a' one was run with +# (it was the default in the code at that time). + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_c # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 +final_layer_normalize_target=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 7000 data/$train_set data/lang_chain $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_d.sh b/egs/swbd/s5c/local/chain/run_tdnn_d.sh new file mode 100755 index 00000000000..fa103660f69 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_d.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_d # Note: _sp will get added to this if $speed_perturb == true. 
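+# (Illustration of the note above, assuming the default speed_perturb=true:
+# this run's models would then end up under exp/chain/tdnn_d_sp, which is the
+# kind of name the later scripts' result comparisons refer to.)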
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +max_param_change=1.0 +final_layer_normalize_target=1.0 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 8000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_e.sh b/egs/swbd/s5c/local/chain/run_tdnn_e.sh new file mode 100755 index 00000000000..3d6aef09224 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_e.sh @@ -0,0 +1,167 @@ +#!/bin/bash + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. 
d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_e # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
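+  # (Specifically, mkgraph takes the HMM topology from the model and tree in
+  # $dir, and only the lexicon/grammar from data/lang_sw1_tg, so the apparent
+  # mismatch should be harmless.)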
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_f.sh b/egs/swbd/s5c/local/chain/run_tdnn_f.sh new file mode 100755 index 00000000000..22e4de418c7 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_f.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_f # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. 
+ steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_g.sh b/egs/swbd/s5c/local/chain/run_tdnn_g.sh new file mode 100755 index 00000000000..aed6401e230 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_g.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_g # Note: _sp will get added to this if $speed_perturb == true. 
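+# (On the notation used in splice_indexes below: each space-separated group
+# lists the frame offsets spliced together at one layer, so the "-9,0,9"
+# group is the wider near-final-layer splicing that the "_g is as _f but more
+# splicing at last layer" comment above refers to.)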
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + # adding --target-num-history-states 500 to match the egs of run_lstm_a.sh. The + # script must have had a different default at that time. + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_h.sh b/egs/swbd/s5c/local/chain/run_tdnn_h.sh new file mode 100755 index 00000000000..b3917ac9a2c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_h.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) +# The WER is quite a bit worse. 
+# b01:s5c: grep Sum exp/chain/tdnn_g_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh +# %WER 13.1 | 1831 21395 | 88.6 8.1 3.4 1.7 13.1 50.0 | exp/chain/tdnn_g_sp/decode_eval2000_sw1_fsh_fg/score_11_0.5/eval2000_hires.ctm.swbd.filt.sys +# b01:s5c: grep Sum exp/chain/tdnn_h_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | utils/best_wer.sh +# %WER 14.9 | 1831 21395 | 87.1 9.0 3.9 2.0 14.9 52.3 | exp/chain/tdnn_h_sp/decode_eval2000_sw1_fsh_fg/score_14_0.0/eval2000_hires.ctm.swbd.filt.sys + +# the train objf is a bit better. The valid objf is about the same but can't really be trusted as +# we had the bug where there was no utt2uniq file. +# exp/chain/tdnn_h_sp/log/compute_prob_train.final.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:131) Overall log-probability for 'output' is -0.0788236 per frame, over 10000 frames. +#exp/chain/tdnn_g_sp/log/compute_prob_train.final.log:LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:131) Overall log-probability for 'output' is -0.08124 per frame, over 10000 frames. + + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_h # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=256 +frames_per_eg=75 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_g_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_i.sh b/egs/swbd/s5c/local/chain/run_tdnn_i.sh new file mode 100755 index 00000000000..9519ecc2789 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_i.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. +# be cautious comparing the valid probs with h though, because +# we fixed the utt2uniq bug at this point, so from h on, the valid probs +# are properly held out. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. 
+ +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_i # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_j.sh b/egs/swbd/s5c/local/chain/run_tdnn_j.sh new file mode 100755 index 00000000000..8b1ff96ae5f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_j.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# _j is as _i and using the same egs, but setting +# --left-deriv-truncate and --right-deriv-truncate to 10 +# instead of 5. +# This does not seem to be helpful at all: WERs are the same or even worse. With +# the trigram model and evaluating on all of eval2000, the WER with the 'i' +# model is 21.1, and of this model is 21.3. +# However, it probably would have made sense to set --frames-overlap-per-eg +# to a larger number - at least 20 - in this setup. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_j # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.3333 +scale_max_param_change=true +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_i_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 10 --right-deriv-truncate 10 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --scale-max-param-change $scale_max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_k.sh b/egs/swbd/s5c/local/chain/run_tdnn_k.sh new file mode 100755 index 00000000000..2393ddaffbb --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_k.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. +# [only] after 4gram rescoring, only 0.1% better than _i. :-( +# %WER 12.7 | 1831 21395 | 89.0 7.8 3.3 1.7 12.7 49.2 | exp/chain/tdnn_k_sp/decode_eval2000_sw1_fsh_fg/score_12_1.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 12.8 | 1831 21395 | 88.8 7.8 3.4 1.6 12.8 49.3 | exp/chain/tdnn_i_sp/decode_eval2000_sw1_fsh_fg/score_14_0.0/eval2000_hires.ctm.swbd.filt.sys + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. 
+ +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_k # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
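+  # (Keeping the egs matters here because this run, like several in this
+  # series, reuses a previous run's examples via --egs-dir, in this case
+  # exp/chain/tdnn_i_sp/egs below; the .nodelete marker is intended to stop
+  # them from being cleaned up in the meantime.)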
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_i_sp/egs \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_l.sh b/egs/swbd/s5c/local/chain/run_tdnn_l.sh new file mode 100755 index 00000000000..1c7f431d4ec --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_l.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# _l is as _k but even longer chunk size: 200 instead of 150. having to halve +# minibatch size to save memory. I correspondingly changed max-param-change. +# ... perhaps very slightly better than k: after 4-gram rescoring, looking at the +# whole of the eval2000 dataset we get improvement 18.9->18.7, but before +# 4-gram rescoring there is no change (20.7). +# on the Swbd subset the improvement is 0.1% before rescoring (14.3->14.2), +# and 0.3% after rescoring (12.7 -> 12.4). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. 
+ +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_l # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=0.666 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_m.sh b/egs/swbd/s5c/local/chain/run_tdnn_m.sh new file mode 100755 index 00000000000..9d29447f78c --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_m.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_m # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. 
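+# (All of the options above are plain shell variables, so once
+# utils/parse_options.sh is sourced below they can be overridden from the
+# command line rather than by editing the script.  A purely hypothetical
+# invocation, for illustration only:
+#   local/chain/run_tdnn_m.sh --stage 12 --train-stage 50
+# which would resume a partially completed training run at iteration 50.)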
+echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 10 --nj 40" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_n.sh b/egs/swbd/s5c/local/chain/run_tdnn_n.sh new file mode 100755 index 00000000000..78029e7161f --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_n.sh @@ -0,0 +1,199 @@ +#!/bin/bash + +# _n is as _m but changing the egs configuration to get better and more even +# coverage of the data: increasing frames_per_eg from 150 to 200, +# and increasing --frames-overlap-per-eg from 10 to 30. +# I am also testing out some script changes in the get_egs.sh script that +# aims to reduce the number of small files (and some accompanying code changes +# that allow us to put the CPU-intensive phase of egs preparation with the +# 'shuffle' jobs). 
+# +# This doesn't seem to have made any consistent difference at all (although on +# average the change was slightly beneficial): on all of eval2000, m->n changed +# 20.9->20.8 with trigram and 18.6->18.7 after 4g rescoring; on train_dev it +# changed 19.31->19.04 with trigram, and 17.58->17.45 after 4g rescoring. + + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_n # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
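+  # (Rough arithmetic behind the header comment: with --frames-per-eg 200 and
+  # --frames-overlap-per-eg 30, consecutive chunks cut from an utterance share
+  # 30 frames, i.e. their start points advance by about 170 frames, which is
+  # what gives the more even coverage of the data mentioned above.)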
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_o.sh b/egs/swbd/s5c/local/chain/run_tdnn_o.sh new file mode 100755 index 00000000000..8085c3a80fe --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_o.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# _o is as _n, but reducing the number of parameters to try to reduce +# over-training: reducing relu-dim from 1024 to 850 and target num-states +# from 12k to 9k. Also modifying the splicing setup in a way that shouldn't +# affect num-params, from "-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" to +# "-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3". +# +# There seems to be a slight improvement: on train_dev, WER changes 19.04->18.99 before +# rescoring, and 17.45->17.29 after. On all of eval2000 the WER changes +# from 20.8->20.6 before fg rescoring, and 18.7->18.5 after. + +# _n is as _m but changing the egs configuration to get better and more even +# coverage of the data: increasing frames_per_eg from 150 to 200, +# and increasing --frames-overlap-per-eg from 10 to 30. +# I am also testing out some script changes in the get_egs.sh script that +# aims to reduce the number of small files (and some accompanying code changes +# that allow us to put the CPU-intensive phase of egs preparation with the +# 'shuffle' jobs). + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. 
+ +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=11 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_o # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=200 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
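+
+  # A quick comparison of the old and new splice_indexes strings (an
+  # illustrative check, not needed by the recipe): per layer, the most
+  # negative offset adds to the total left context and the most positive one
+  # to the total right context, while the number of offsets is what drives
+  # that layer's input dimension.
+  for s in "-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" "-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3"; do
+    echo "$s" | awk '{ nleft = 0; nright = 0; n = 0;
+      for (i = 1; i <= NF; i++) { m = split($i, a, ","); lo = a[1] + 0; hi = a[1] + 0;
+        for (j = 1; j <= m; j++) { v = a[j] + 0; if (v < lo) lo = v; if (v > hi) hi = v; n++ }
+        nleft += lo; nright += hi }
+      printf("offsets=%d left-context=%d right-context=%d  [%s]\n", n, -nleft, nright, $0) }'
+  done
+  # Both strings contain 13 offsets in total, which is consistent with the
+  # header's note that the change should not affect the parameter count
+  # (assuming the same hidden dimension at every layer); only the temporal
+  # context is redistributed.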
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi + +# Running another decode with tighter beam. +# time is about twice faster-- easily within real-time even on fairly old machines. +# degradation on eval2000 is 14.2->14.4 before rescoring and 12.2->12.5 after; +# on train_dev is's 18.99->19.09 before rescoring, and 17.29->17.55 after. Probably +# the greater degradation after rescoring is due to the lattice-beam being too tight, +# which might not even affect the speed much (could easily make it 7.0). +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --beam 11.0 --lattice-beam 6.0 --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_11_6 || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_11_6 || exit 1; + fi + ) & + done +fi + +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_p.sh b/egs/swbd/s5c/local/chain/run_tdnn_p.sh new file mode 100755 index 00000000000..97bb6dfbfc0 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_p.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. 
give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_p # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
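+
+  # Shape of the per-frame derivative weights described in the header (a
+  # sketch assuming a simple linear ramp; the exact shape is decided by the
+  # code, not by this script): 10 zero-weight frames at each edge of a
+  # 150-frame chunk, then a ramp up to 1.0 over the next 10 frames.
+  awk 'BEGIN { T = 150; zero = 10; ramp = 10;
+    for (t = 0; t < T; t++) {
+      d = (t < T - 1 - t) ? t : T - 1 - t;            # distance from the nearer edge
+      w = (d < zero) ? 0.0 : (d < zero + ramp ? (d - zero + 1) / ramp : 1.0);
+      printf("%.1f%s", w, (t == T - 1) ? "\n" : " ") } }'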
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 1024 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_q.sh b/egs/swbd/s5c/local/chain/run_tdnn_q.sh new file mode 100755 index 00000000000..70274105c93 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_q.sh @@ -0,0 +1,206 @@ +#!/bin/bash + + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. +# This reduction in parameters seems to be helpful: on train_dev (fg), +# change is 18.45 -> 18.07, and on all of eval2000 (fg), from 20.0 -> 19.8. + + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This will. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_q # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
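+
+  # Rough scale of the reduction from relu-dim 1024 -> 850 (illustrative only,
+  # ignoring biases and the input and output layers): a hidden affine layer
+  # that splices two hidden-layer frames has about (2 * relu_dim) * relu_dim
+  # weights, so each such layer shrinks by the square of the dimension ratio.
+  awk 'BEGIN { printf("per-hidden-layer weight ratio ~ %.2f\n", (850.0 * 850) / (1024 * 1024)) }'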
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_r.sh b/egs/swbd/s5c/local/chain/run_tdnn_r.sh new file mode 100755 index 00000000000..3dcb1311db4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_r.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# _r is as _q except adding --lm-opts "--num-extra-states=0" +# to reduce the size of the phone LM. Not really expecting much difference + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. 
+ + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_r # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
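+
+  # The point of --lm-opts "--num-extra-states=0" is to shrink the phone LM
+  # and hence the denominator graph; comparing file sizes against the _q run
+  # shows the effect once both runs have trained (this assumes the denominator
+  # FST is written to $dir/den.fst, as the chain training scripts do; the
+  # command prints nothing if the FSTs are not there yet):
+  ls -l exp/chain/tdnn_q_sp/den.fst $dir/den.fst 2>/dev/null || true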
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=0" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_s.sh b/egs/swbd/s5c/local/chain/run_tdnn_s.sh new file mode 100755 index 00000000000..7ee23833fc9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_s.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
+# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_s # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
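+
+  # Relative improvement corresponding to the absolute WER changes quoted in
+  # the header (a quick arithmetic check, nothing more):
+  awk 'BEGIN { printf("eval2000 fg: %.1f%% rel.,  train_dev fg: %.1f%% rel.\n",
+               100 * (19.8 - 18.0) / 19.8, 100 * (18.07 - 16.96) / 18.07) }'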
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_t.sh b/egs/swbd/s5c/local/chain/run_tdnn_t.sh new file mode 100755 index 00000000000..8b5805093e2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_t.sh @@ -0,0 +1,211 @@ +#!/bin/bash + + +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# This makes things slightly worse: 18.0->18.2 on eval2000 after fg rescoring (20.1->20.1 before) +# and 16.96->17.26 on train_dev after fg rescoring (18.45->18.68 before). + +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. 
On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_t # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
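+
+  # This run reuses the egs dumped by the _q experiment (see --egs-dir below),
+  # so it is worth checking that they are still around before launching
+  # training (a defensive check, not part of the original recipe):
+  [ -d exp/chain/tdnn_q_sp/egs ] || { echo "$0: expected egs in exp/chain/tdnn_q_sp/egs" >&2; exit 1; }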
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 2.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_u.sh b/egs/swbd/s5c/local/chain/run_tdnn_u.sh new file mode 100755 index 00000000000..62470d31068 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_u.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _u is as _t but also setting --truncate-deriv-weights 3. +# This doesn't seem to be helpful, or at least inconsistent: 18.2->18.6 on all of eval2000 +# after fg rescoring (20.1->20.7 before); on train_dev, 17.26->17.14 after fg rescoring, +# or 18.6->18.74 before. So worse on eval2000, inconsistent on train_dev. +# The train and valid probs are actually quite different: -0.111 -> -0.1427 on train, +# -0.109 -> -0.783 on valid. So it looks like the edge effects do make a difference- +# maybe some kind of regularization effect? +# +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. 
+ +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_u # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
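+
+  # The train/valid probabilities discussed in the header can be pulled out of
+  # the training logs once the run finishes, e.g. (assuming the usual
+  # compute_prob_{train,valid}.ITER.log naming of the nnet3 training scripts,
+  # with the per-iteration probability on lines containing 'Overall'):
+  grep -H Overall $dir/log/compute_prob_{train,valid}.*.log 2>/dev/null | tail -n 4 || true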
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --truncate-deriv-weights 3 \ + --pdf-boundary-penalty 2.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_v.sh b/egs/swbd/s5c/local/chain/run_tdnn_v.sh new file mode 100755 index 00000000000..206e8aa45f9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_v.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +# _v is as _u but setting pdf-boundary-penalty to 0.0 (as in t->s), +# and also trying a smaller language model: --lm-opts "--num-extra-states=0" +# +# It's worse: on train_dev, 18.73->19.29 with tg, 17.14->17.75 with fg. [around 0.6 abs worse] +# on eval2000, 20.1->20.7 with tg 18.2->18.6 with fg. [around 0.5 abs worse]. +# Now, the s->t stage was on average over the 4 conditions, about 0.2 worse, so the t->s change +# (changing pdf-boundary-penalty to 0.0) should have given 0.2 abs improvement. This means that we +# we have 0.7 to 0.8 abs degradation from setting --lm-opts "--num-extra-states=0". +# (Note: this could possibly be an interaction between the --truncate-deriv-weights and +# the pdf-boundary-penalty)? + +# +# +# _u is as _t but also setting --truncate-deriv-weights 3. +# _t is as _s but setting pdf-boundary-penalty to 2.0 +# _s is as _q but setting pdf-boundary-penalty to 0.0 + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... 
we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=0" \ + --truncate-deriv-weights 3 \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_w.sh b/egs/swbd/s5c/local/chain/run_tdnn_w.sh new file mode 100755 index 00000000000..36a54e3e5c5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_w.sh @@ -0,0 +1,215 @@ +#!/bin/bash + +# _w is as _s (with --pdf-boundary-penalty 0.0) but setting +# --lm-opts "--num-extra-states=500" (like the opposite of +# the u->v change, which was very unhelpful). Also making a script change +# to set the same --pdf-boundary-penalty value on the train and valid egs for +# diagnostics (this won't affect WERs). +# See the top of run_tdnn_2a.sh for the WER comparisons for this experiment. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. 
the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_w # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --lm-opts "--num-extra-states=500" \ + --pdf-boundary-penalty 0.0 \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_x.sh b/egs/swbd/s5c/local/chain/run_tdnn_x.sh new file mode 100755 index 00000000000..cf5bb635200 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_x.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. +# See the top of run_tdnn_2a.sh for more detailed WER comparisons for this experiment. +# It's worse by about 0.3: on train_dev, +# before rescoring 16.96->17.22, after rescoring 18.45->18.67; on all of +# eval2000, before rescoring 20.1->20.4, after rescoring 18.0->18.4 + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... 
we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_x # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=0" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_y.sh b/egs/swbd/s5c/local/chain/run_tdnn_y.sh new file mode 100755 index 00000000000..06a3eff123e --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_y.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# _y is as _s but trying --apply-deriv-weights false. (note: in the +# interim, the script was changed so the train and valid probs have --pdf-boundary-penalty 0 +# and are no longer comparable with the ones in _s. +# +# Compared to s, the results are improved: on train_dev, 18.45->18.04 with tg +# and 16.96->16.57 with fg; on all of eval2000, 20.1->19.8 with tg and 18.0 to +# 17.9 with fg. +# +# +# I recomputed the train and valid probs using the .486 model and no --pdf-boundary-penalty option, to +# be able to compre with the _s ones. In _s the (train,valid) probs at iter 485 were (-0.0691, -0.0997), +# in _y the (train,valid) probs at iter 486 were (-0.0655,-0.0998). So better on train, essentially +# the same on valid. It makes sense it would be better on train, since its overtraining is more +# closely aligned with the distribution of training segments on which we compute the objf-- also because +# we've simply trained more, i.e. equivalent to slightly more epochs. + + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. 
+ +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_y # Note: _sp will get added to this if $speed_perturb == true. + +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --pdf-boundary-penalty 0.0 \ + --egs-dir exp/chain/tdnn_q_sp/egs \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + iter=300 + steps/nnet3/decode.sh --iter $iter --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff}_it$iter || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_it$iter || exit 1; + fi + ) & + done +fi +wait; + +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_z.sh b/egs/swbd/s5c/local/chain/run_tdnn_z.sh new file mode 100755 index 00000000000..db85df89a7d --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_z.sh @@ -0,0 +1,216 @@ +#!/bin/bash + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also w, which has --num-extra-states=500, and 2a, which has 8000). +# See the top of un_tdnn_2a.sh for the WER comparisons for this experiment. 
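+# To line the --num-extra-states sweep up side by side (x=0, w=500, z=2000,
+# 2a=8000) once the runs finish, the show_wer.sh helper added elsewhere in
+# this patch should do the job, e.g.
+#   local/chain/show_wer.sh x w z 2a
+# (it greps the best WERs out of exp/chain/tdnn_${letter}_sp/decode_*).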
+ +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. + +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_z # Note: _sp will get added to this if $speed_perturb == true. 
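+# Rough sketch of the edge handling described in the _p note above, assuming
+# the weights apply per 150-frame chunk as set further down (frames_per_eg=150,
+# --frames-overlap-per-eg 30):
+#   frames   1-10  : deriv weight 0.0
+#   frames  11-20  : ramps 0.0 -> 1.0
+#   frames  21-130 : 1.0
+#   frames 131-140 : ramps 1.0 -> 0.0
+#   frames 141-150 : 0.0
+# so only the ~110 interior frames contribute fully to the objective.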
+ +# TDNN options +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3" + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=30 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --pdf-boundary-penalty 0.0 \ + --lm-opts "--num-extra-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 30" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim 850 \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --transition-scale 0.0 \ + --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/show_wer.sh b/egs/swbd/s5c/local/chain/show_wer.sh new file mode 100755 index 00000000000..a82c4acf26d --- /dev/null +++ b/egs/swbd/s5c/local/chain/show_wer.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_tg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep WER exp/chain/tdnn_${l}_sp/decode_train_dev_sw1_fsh_fg/wer_* | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh +done +for l in $*; do + grep Sum exp/chain/tdnn_${l}_sp/decode_eval2000_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh +done diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh new file mode 100755 index 00000000000..8e4ef2935a3 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v1.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# this is based oni dan's tdnn_2o script +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v1 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
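+  # The create_split_dir.pl call above is CLSP-specific: as far as I can tell
+  # it distributes the (large) egs storage over the /export/b0{5,6,7,8}
+  # filesystems via symlinks under $dir/egs/storage; on other clusters the
+  # block is simply skipped and the egs are written directly under $dir/egs.
+  # The .nodelete marker is meant to stop automatic egs cleanup from removing them.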
+ + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh new file mode 100755 index 00000000000..f5718837690 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v2.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# this is same as v1 script but with l2 regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v2 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --l2-regularize 0.00005 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh new file mode 100755 index 00000000000..3b280712aeb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v3.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v3 # Note: _sp will get added to this if $speed_perturb == true. 
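+# A note on the xent-regularization this version adds (my reading of the
+# --xent-regularize 0.2 option passed to training below): a second,
+# cross-entropy output layer is trained alongside the chain output and its
+# objective is added to the LF-MMI objective with weight 0.2, roughly
+#   objf = LF-MMI + 0.2 * CE
+# acting as a regularizer; the xent output is not used at decode time.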
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.2 \ + --l2-regularize 0.00005 \ + --apply-deriv-weights false \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --splice-indexes "$splice_indexes" \ + --pool-type "$pool_type" \ + --pool-window "$pool_window" \ + --pool-lpfilter-width "$pool_lpfilter_width" \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --final-layer-normalize-target $final_layer_normalize_target \ + --relu-dim $relu_dim \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + --egs-dir "$common_egs_dir" \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh new file mode 100755 index 00000000000..c10e296dee9 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v4.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v4 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +xent_regularize=0.2 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "$splice_indexes" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh new file mode 100755 index 00000000000..262e241296f --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v5.sh @@ -0,0 +1,205 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v5 # Note: _sp will get added to this if $speed_perturb == true. 
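+# Relative to run_tdnn_v4.sh the main change visible in this file appears to
+# be --chain.leaky-hmm-coefficient (0.00001 here vs 0.1 there).  As I
+# understand the option, it lets the denominator HMM leak a little probability
+# mass into every state on each frame; a value this small essentially
+# disables that.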
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +xent_regularize=0.2 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=1.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=700 +frames_per_eg=150 +remove_egs=false +common_egs_dir= + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "$splice_indexes" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
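+  # For reference, train.py takes the same knobs as the train_tdnn.sh calls
+  # earlier in this patch, just grouped into namespaces; from comparing the
+  # two call sites:
+  #   --lm-opts "..."       -> --chain.lm-opts="..."
+  #   --xent-regularize X   -> --chain.xent-regularize X
+  #   --max-param-change X  -> --trainer.max-param-change X
+  #   --frames-per-eg N     -> --egs.chunk-width N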
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.00001 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh new file mode 100755 index 00000000000..866b5064757 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v6.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v6 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +splice_indexes="-2,-1,0,1,2 -1,0,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=768 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir exp/chain/tdnn_2y_sp/egs \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh new file mode 100755 index 00000000000..ede618e0639 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v7 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir=exp/chain/tdnn_2y_sp/egs +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh new file mode 100755 index 00000000000..8aa54c556a4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v7_pool.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# this is same as v2 script but with xent-regularization +# it has a different splicing configuration +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v7 # Note: _sp will get added to this if $speed_perturb == true. 
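+# Careful: this is the same experiment directory as run_tdnn_v7.sh above, so
+# running both as-is would overwrite that run; a distinct name (for instance
+# exp/chain/tdnn_v7_pool, just a suggestion) would keep the two separate.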
+decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=576 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
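+  # Rough context arithmetic for the splicing used in the configs above
+  # ("-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0"): summing the most
+  # negative / most positive offset at each layer gives about 20 frames of
+  # left context and 14 of right context per output frame, assuming the
+  # config generator composes the offsets additively.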
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh new file mode 100755 index 00000000000..e217fba7af5 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tdnn/run_tdnn_v8_pool.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +# same as v7 but with large dimension +set -e + +# configs for 'chain' +affix= +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_v8 # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +# smoothing options +pool_window= +pool_type='none' +pool_lpfilter_width= +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +relu_dim=768 +frames_per_eg=150 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + if [ ! -z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py $pool_opts \ + $repair_opts \ + --feat-dir data/${train_set}_hires \ + --ivector-dir exp/nnet3/ivectors_${train_set} \ + --tree-dir $treedir \ + $dim_opts \ + --splice-indexes "-2,-1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1200000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/confidence_calibration.sh b/egs/swbd/s5c/local/confidence_calibration.sh new file mode 100755 index 00000000000..de330866622 --- /dev/null +++ b/egs/swbd/s5c/local/confidence_calibration.sh @@ -0,0 +1,84 @@ +#!/bin/bash +. cmd.sh +. path.sh + +# Global options, +graph=exp/tri4/graph_sw1_tg +arpa_gz=data/local/lm/sw1_fsh.o3g.kn.gz +lmwt=14 + +# Dev-set options, +dev_data=data/train_dev +dev_latdir=exp/tri4/decode_dev_sw1_tg + +# Eval-set options, +eval_data=data/eval2000 +eval_latdir=exp/tri4/decode_eval2000_sw1_tg + +. utils/parse_options.sh +set -euxo pipefail + +# Derived options, +dev_caldir=$dev_latdir/confidence_$lmwt +eval_caldir=$eval_latdir/confidence_$lmwt + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
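+# The calibrated CTM keeps the usual five CTM fields (recording, channel,
+# start, duration, word) and the recalibrated confidence is the last column;
+# a quick way to inspect it is, for example:
+#   head -n 3 $eval_caldir/ctm_calibrated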
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -i -v -E '\[NOISE|LAUGHTER|VOCALIZED-NOISE\]' | \ + grep -i -v -E '' | \ + grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW) ' | \ + awk '$5 !~ /^.*-$/' | \ + local/map_acronyms_ctm.py -M data/local/dict_nosp/acronyms.map -i - -o ${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/swbd/s5c/local/map_acronyms_ctm.py b/egs/swbd/s5c/local/map_acronyms_ctm.py index c7f002cb2c7..983c02205d9 100755 --- a/egs/swbd/s5c/local/map_acronyms_ctm.py +++ b/egs/swbd/s5c/local/map_acronyms_ctm.py @@ -15,6 +15,9 @@ parser.add_argument('-M','--Map',help='Input acronyms map', required=True) args = parser.parse_args() +if args.input == '-': args.input = '/dev/stdin' +if args.output == '-': args.output = '/dev/stdout' + dict_acronym_back = {} fin_map = open(args.Map, "r") for line in fin_map: diff --git a/egs/swbd/s5c/local/nnet/run_dnn.sh b/egs/swbd/s5c/local/nnet/run_dnn.sh index d0bc50d6ea7..0ad87100e31 100755 --- a/egs/swbd/s5c/local/nnet/run_dnn.sh +++ b/egs/swbd/s5c/local/nnet/run_dnn.sh @@ -30,28 +30,29 @@ has_fisher=true . utils/parse_options.sh || exit 1; # +set -euxo pipefail + if [ $stage -le 0 ]; then # Store fMLLR features, so we can train on them easily, # eval2000 dir=$data_fmllr/eval2000 steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir $gmmdir/decode_eval2000_sw1_tg \ - $dir data/eval2000 $gmmdir $dir/log $dir/data || exit 1 + $dir data/eval2000 $gmmdir $dir/log $dir/data # train dir=$data_fmllr/train_nodup steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ --transform-dir ${gmmdir}_ali_nodup \ - $dir data/train_nodup $gmmdir $dir/log $dir/data || exit 1 + $dir data/train_nodup $gmmdir $dir/log $dir/data # split the data : 90% train 10% cross-validation (held-out) - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 fi if [ $stage -le 1 ]; then # Pre-train DBN, i.e. a stack of RBMs dir=exp/dnn5b_pretrain-dbn - (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log $cuda_cmd $dir/log/pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --rbm-iter 1 $data_fmllr/train_nodup $dir || exit 1; + steps/nnet/pretrain_dbn.sh --rbm-iter 1 $data_fmllr/train_nodup $dir fi if [ $stage -le 2 ]; then @@ -60,16 +61,15 @@ if [ $stage -le 2 ]; then ali=${gmmdir}_ali_nodup feature_transform=exp/dnn5b_pretrain-dbn/final.feature_transform dbn=exp/dnn5b_pretrain-dbn/6.dbn - (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train $cuda_cmd $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/train_nodup_tr90 $data_fmllr/train_nodup_cv10 data/lang $ali $ali $dir || exit 1; + $data_fmllr/train_nodup_tr90 $data_fmllr/train_nodup_cv10 data/lang $ali $ali $dir # Decode with the trigram swbd language model. 
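  # (the acwt 0.08333 used below is 1/12, i.e. the usual LM scale of 12
  # expressed as an acoustic scale for the DNN decode)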
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ --config conf/decode_dnn.config --acwt 0.08333 \ $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; + $dir/decode_eval2000_sw1_tg if $has_fisher; then # Rescore with the 4gram swbd+fisher language model. steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ @@ -80,8 +80,7 @@ fi # Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. We use usually good acwt 0.1 -# Lattices are re-generated after 1st epoch, to get faster convergence. +# with per-utterance updates. The typical acwt value is around 0.1 dir=exp/dnn5b_pretrain-dbn_dnn_smbr srcdir=exp/dnn5b_pretrain-dbn_dnn acwt=0.0909 @@ -89,62 +88,28 @@ acwt=0.0909 if [ $stage -le 3 ]; then # First we generate lattices and alignments: steps/nnet/align.sh --nj 250 --cmd "$train_cmd" \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; + $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali steps/nnet/make_denlats.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats || exit 1; + --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats fi if [ $stage -le 4 ]; then # Re-train the DNN by 1 iteration of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 - # Decode (reuse HCLG graph) - for ITER in 1; do - # Decode with the trigram swbd language model. - steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ - --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; - if $has_fisher; then - # Rescore with the 4gram swbd+fisher language model. - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ - data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - $dir/decode_eval2000_sw1_{tg,fsh_fg} - fi - done -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats -srcdir=exp/dnn5b_pretrain-dbn_dnn_smbr -acwt=0.0909 - -if [ $stage -le 5 ]; then - # First we generate lattices and alignments: - steps/nnet/align.sh --nj 250 --cmd "$train_cmd" \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali || exit 1; - steps/nnet/make_denlats.sh --nj 10 --sub-split 100 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --acwt $acwt $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_denlats || exit 1; -fi - -if [ $stage -le 6 ]; then - # Re-train the DNN by 1 iteration of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 2 --acwt $acwt --do-smbr true \ - $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ + $data_fmllr/train_nodup data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir # Decode (reuse HCLG graph) - for ITER in 1 2; do + for ITER in 4 3 2 1; do # Decode with the trigram swbd language model. steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" \ --config conf/decode_dnn.config \ --nnet $dir/${ITER}.nnet --acwt $acwt \ $gmmdir/graph_sw1_tg $data_fmllr/eval2000 \ - $dir/decode_eval2000_sw1_tg || exit 1; + $dir/decode_eval2000_sw1_tg_it$ITER if $has_fisher; then # Rescore with the 4gram swbd+fisher language model. 
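      # (lmrescore_const_arpa.sh only replaces the LM scores on the existing
      # lattices with the 4-gram const-arpa LM, so no re-decoding is needed)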
steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - $dir/decode_eval2000_sw1_{tg,fsh_fg} + $dir/decode_eval2000_sw1_{tg,fsh_fg}_it$ITER fi done fi diff --git a/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh b/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh index f15add1f3f5..4cd6a21873f 100755 --- a/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh @@ -12,221 +12,186 @@ # Config: stage=0 # resume training with --stage=N +has_fisher=true # End of config. . utils/parse_options.sh || exit 1; # +set -euxo pipefail + +train_src=data/train_nodup +train=data-fbank-pitch/train_nodup + +dev_src=data/eval2000 +dev=data-fbank-pitch/eval2000 + +gmmdir=exp/tri4 + +lang=data/lang +lang_test=data/lang_sw1_tg + if [ $stage -le 1 ]; then - # prepare the FBANK+f0 features - # eval2000 - dir=data-fbank-pitch/eval2000; srcdir=data/eval2000 - (mkdir -p $dir; cp $srcdir/* $dir; ) - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dir $dir/log $dir/data || exit 1; - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; - - # training set - dir=data-fbank-pitch/train; srcdir=data/train - (mkdir -p $dir; cp $srcdir/* $dir; ) - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dir $dir/log $dir/data || exit 1; - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; + [ -e $dev ] && echo "Existing '$dev', better quit than overwrite!!!" && exit 1 + # prepare the FBANK+f0 features, + # eval2000, + utils/copy_data_dir.sh $dev_src $dev; rm $dev/{feats,cmvn}.scp + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $dev $dev/log $dev/data + steps/compute_cmvn_stats.sh $dev $dev/log $dev/data + # training set, + utils/copy_data_dir.sh $train_src $train; rm $train/{feats,cmvn}.scp + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 20 $train $train/log $train/data + steps/compute_cmvn_stats.sh $train $train/log $train/data fi if [ $stage -le 2 ]; then - # Prepare same subsets as in the main MFCC-GMM recipe, these will be used - # during during building GMM system from flat-start, later in the Tandem recipe. - data=data-fbank-pitch - - # Use the first 4k sentences as dev set. Note: when we trained the LM, we used - utils/subset_data_dir.sh --first $data/train 4000 $data/train_dev # 5hr 6min - n=$[`cat data/train/segments | wc -l` - 4000] - utils/subset_data_dir.sh --last $data/train $n $data/train_nodev - - # Prepare data for training mono - utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort - utils/subset_data_dir.sh $data/train_100kshort 10000 $data/train_10k - local/remove_dup_utts.sh 100 $data/train_10k $data/train_10k_nodup - - # Take the first 30k utterances (about 1/8th of the data) - utils/subset_data_dir.sh --first $data/train_nodev 30000 $data/train_30k - local/remove_dup_utts.sh 200 $data/train_30k $data/train_30k_nodup - - # Take the first 100k utterances (just under half the data); we'll use - # this for later stages of training. 
- utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k - local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup - - # Full training dataset, - local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup - # split the data : 90% train 10% cross-validation (held-out) - dir=$data/train_nodup - utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1 + # split the data : 90% train, 10% cross-validation (held-out set), + utils/subset_data_dir_tr_cv.sh $train ${train}_tr90 ${train}_cv10 fi ######################################################################################### # Let's build universal-context bottleneck network # - Universal context MLP is a hierarchy of two bottleneck neural networks -# - The first network can see a limited range of frames (11 frames) -# - The second network sees concatenation of bottlneck outputs of the first -# network, with temporal shifts -10 -5 0 5 10, (in total a range of 31 frames +# - The first network has limited range of frames on input (11 frames) +# - The second network input is a concatenation of bottlneck outputs from the first +# network, with temporal shifts -10 -5..5 10, (in total a range of 31 frames # in the original feature space) -# - This structure has been reported to produce superior performance -# compared to a network with single bottleneck +# - This structure produces superior performance w.r.t. single bottleneck network # if [ $stage -le 3 ]; then - # 1st network, overall context +/-5 frames - # - the topology is 90_1500_1500_80_1500_NSTATES, linear bottleneck - dir=exp/nnet5b_uc-part1 - ali=exp/tri4_ali_nodup + # Train 1st network, overall context +/-5 frames + # - the topology is 90_1500_1500_80_1500_NSTATES, linear bottleneck, + dir=exp/nnet5uc-part1 + ali=${gmmdir}_ali_nodup $cuda_cmd $dir/log/train_nnet.log \ - steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 80 --apply-cmvn true \ - --copy-feats false \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 80 \ + --cmvn-opts "--norm-means=true --norm-vars=false" \ --feat-type traps --splice 5 --traps-dct-basis 6 --learn-rate 0.008 \ - data-fbank-pitch/train_nodup_tr90 data-fbank-pitch/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1; + ${train}_tr90 ${train}_cv10 $lang $ali $ali $dir fi +# if [ $stage -le 4 ]; then # Compose feature_transform for the next stage, - # - remaining part of the first network is fixed - dir=exp/nnet5b_uc-part1 + # - remaining part of the first network is fixed, + dir=exp/nnet5uc-part1 feature_transform=$dir/final.feature_transform.part1 - nnet-concat $dir/final.feature_transform \ - "nnet-copy --remove-last-layers=4 --binary=false $dir/final.nnet - |" \ - "utils/nnet/gen_splice.py --fea-dim=80 --splice=2 --splice-step=5 |" \ - $feature_transform || exit 1 + # Create splice transform, + nnet-initialize <(echo " 80 1040 -10 -5:5 10 ") \ + $dir/splice_for_bottleneck.nnet + # Concatanate the input-transform, 1stage network, splicing, + nnet-concat $dir/final.feature_transform "nnet-copy --remove-last-components=4 $dir/final.nnet - |" \ + $dir/splice_for_bottleneck.nnet $feature_transform - # 2nd network, overall context +/-15 frames - # - the topology will be 400_1500_1500_30_1500_NSTATES, again, the bottleneck is linear - dir=exp/nnet5b_uc-part2 - ali=exp/tri4_ali_nodup + # Train 2nd network, overall context +/-15 frames, + # - the topology will be 1040_1500_1500_30_1500_NSTATES, linear bottleneck, + # - cmvn_opts get imported inside 'train.sh', + dir=exp/nnet5uc-part2 + 
ali=${gmmdir}_ali_nodup $cuda_cmd $dir/log/train_nnet.log \ - steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 30 --apply-cmvn true \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1500 --bn-dim 30 \ --feature-transform $feature_transform --learn-rate 0.008 \ - data-fbank-pitch/train_nodup_tr90 data-fbank-pitch/train_nodup_cv10 data/lang ${ali} ${ali} $dir || exit 1; + ${train}_tr90 ${train}_cv10 $lang $ali $ali $dir fi # ######################################################################################### +# Decode the 2nd DNN, if [ $stage -le 5 ]; then - # Store the BN-features - data=data-bn/nnet5b_uc-part2 - srcdata=data-fbank-pitch/ - nnet=exp/nnet5b_uc-part2 - # eval2000 - steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 20 $data/eval2000 $srcdata/eval2000 \ - $nnet $data/eval2000/log $data/eval2000/data || exit 1 - # trainig data (full set) - steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 40 $data/train $srcdata/train \ - $nnet $data/train/log $data/train/data || exit 1 - - # Compute CMVN of the BN-features - dir=data-bn/nnet5b_uc-part2/train - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; - dir=data-bn/nnet5b_uc-part2/eval2000 - steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; + dir=exp/nnet5uc-part2 + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.08333 \ + $gmmdir/graph_sw1_tg $dev $dir/decode_eval2000_sw1_tg fi +# Store the BN-features, +nnet=exp/nnet5uc-part2 +train_bn=data-$(basename $nnet)/train_nodup +dev_bn=data-$(basename $nnet)/eval2000 if [ $stage -le 6 ]; then - # Prepare BN-feature subsets same as with MFCCs in run.sh - data=data-bn/nnet5b_uc-part2/ - - # Use the first 4k sentences as dev set. - utils/subset_data_dir.sh --first $data/train 4000 $data/train_dev # 5hr 6min - n=$[`cat data/train/segments | wc -l` - 4000] - utils/subset_data_dir.sh --last $data/train $n $data/train_nodev - - # Prepare data for training mono - utils/subset_data_dir.sh --shortest $data/train_nodev 100000 $data/train_100kshort - utils/subset_data_dir.sh $data/train_100kshort 10000 $data/train_10k - local/remove_dup_utts.sh 100 $data/train_10k $data/train_10k_nodup - - # Take the first 30k utterances (about 1/8th of the data) - utils/subset_data_dir.sh --first $data/train_nodev 30000 $data/train_30k - local/remove_dup_utts.sh 200 $data/train_30k $data/train_30k_nodup - - # Take the first 100k utterances (just under half the data); we'll use - # this for later stages of training. - utils/subset_data_dir.sh --first $data/train_nodev 100000 $data/train_100k - local/remove_dup_utts.sh 200 $data/train_100k $data/train_100k_nodup - - # Full dataset - local/remove_dup_utts.sh 300 $data/train_nodev $data/train_nodup + # eval2000, + steps/nnet/make_bn_feats.sh --cmd "$train_cmd" --nj 20 $dev_bn $dev $nnet $dev_bn/log $dev_bn/data + # trainig, + steps/nnet/make_bn_feats.sh --cmd "$train_cmd --max-jobs-run 50" --nj 200 $train_bn $train $nnet $train_bn/log $train_bn/data + # For further GMM training, we have to produce cmvn statistics even if not used!!! 
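+  # (the GMM training and decoding scripts expect cmvn.scp to be present in
+  # the data directory, even when the cmvn options later make it a no-op)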
+ steps/compute_cmvn_stats.sh $dev_bn $dev_bn/log $dev_bn/data + steps/compute_cmvn_stats.sh $train_bn $train_bn/log $train_bn/data fi - -# Start building the tandem GMM system -# - train from mono to tri4b, run bmmi training -bndata=data-bn/nnet5b_uc-part2/ - +# Use single-pass retraining to build new GMM system on top of bottleneck features, if [ $stage -le 7 ]; then - steps/tandem/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup $bndata/train_10k_nodup data/lang exp/tandem2uc-mono0a || exit 1; - - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-mono0a exp/tandem2uc-mono0a_ali || exit 1; - - steps/tandem/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-mono0a_ali exp/tandem2uc-tri1 || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri1 exp/tandem2uc-tri1/graph - - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri1/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri1/decode_eval2000 + dir=exp/tri6uc + ali_src=${gmmdir}_ali_nodup + graph=$dir/graph_${lang_test#*lang_} + # Train, + # GMM on bn features, no cmvn, no lda-mllt, + steps/train_deltas.sh --cmd "$train_cmd" --delta-opts "--delta-order=0" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --beam 20 --retry-beam 80 \ + 11500 200000 $train_bn $lang $ali_src $dir + # Decode, + utils/mkgraph.sh $lang_test $dir $graph + steps/decode.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 --beam 15.0 --lattice-beam 8.0 \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph) + # Align, + steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \ + --beam 20 --retry-beam 80 \ + $train_bn $lang $dir ${dir}_ali fi +# Train SAT-adapted GMM on bottleneck features, if [ $stage -le 8 ]; then - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-tri1 exp/tandem2uc-tri1_ali || exit 1; - - steps/tandem/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_30k_nodup $bndata/train_30k_nodup data/lang exp/tandem2uc-tri1_ali exp/tandem2uc-tri2 || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri2 exp/tandem2uc-tri2/graph || exit 1; - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri2/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri2/decode_eval2000 || exit 1; + dir=exp/tri7uc-sat + ali=exp/tri6uc_ali + graph=$dir/graph_${lang_test#*lang_} + # Train, + # fmllr-gmm system on bottleneck features, + # - no cmvn, put fmllr to the features directly (no lda), + # - note1 : we don't need cmvn, similar effect has diagonal of fmllr transform, + # - note2 : lda+mllt was causing a small hit <0.5%, + steps/train_sat.sh --cmd "$train_cmd" --beam 20 --retry-beam 80 \ + 11500 200000 $train_bn $lang $ali $dir + # Decode, + utils/mkgraph.sh $lang_test $dir $graph + steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 --beam 15.0 --lattice-beam 8.0 \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph) fi +# Prepare alignments and lattices for bMMI training, if [ $stage -le 9 ]; then - steps/tandem/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri2 exp/tandem2uc-tri2_ali || exit 1; - - # Train tri3b, which is LDA+MLLT, on 100k_nodup data. 
- steps/tandem/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/train_100k_nodup $bndata/train_100k_nodup data/lang exp/tandem2uc-tri2_ali exp/tandem2uc-tri3b || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri3b exp/tandem2uc-tri3b/graph || exit 1; - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri3b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri3b/decode_eval2000 || exit 1; + dir=exp/tri7uc-sat + # Align, + steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" --beam 20 --retry-beam 80 \ + $train_bn $lang $dir ${dir}_ali_nodup + # Make denlats, + steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" --acwt 0.05 \ + --config conf/decode.config --transform-dir ${dir}_ali_nodup \ + $train_bn $lang $dir ${dir}_denlats_nodup fi +# 4 iterations of bMMI seems to work well overall. The number of iterations is +# used as an explicit argument even though train_mmi.sh will use 4 iterations by +# default. +num_mmi_iters=4 if [ $stage -le 10 ]; then - # From now, we start building a more serious system (with SAT), - # and we'll do the alignment with fMLLR. - steps/tandem/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3b exp/tandem2uc-tri3b_ali_nodup || exit 1; - - steps/tandem/train_sat.sh --cmd "$train_cmd" \ - 11500 200000 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri3b_ali_nodup exp/tandem2uc-tri4b || exit 1; - - utils/mkgraph.sh data/lang_test exp/tandem2uc-tri4b exp/tandem2uc-tri4b/graph || exit 1 - steps/tandem/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - exp/tandem2uc-tri4b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri4b/decode_eval2000 || exit 1 + dir=exp/tri7uc-sat_mmi_b0.1 + graph=exp/tri7uc-sat/graph_${lang_test#*lang_} + steps/train_mmi.sh --cmd "$decode_cmd" \ + --boost 0.1 --num-iters $num_mmi_iters \ + $train_bn $lang exp/tri7uc-sat_{ali,denlats}_nodup ${dir} + for iter in 1 2 3 4; do + steps/decode.sh --nj 30 --cmd "$decode_cmd" --acwt 0.05 \ + --config conf/decode.config --iter $iter \ + --transform-dir exp/tri7uc-sat/decode_$(basename $dev_bn)_$(basename $graph) \ + $graph $dev_bn $dir/decode_$(basename $dev_bn)_$(basename $graph)_it${iter} + done fi -# bMMI starting from system in tandem2uc-tri4b, use full dataset. if [ $stage -le 11 ]; then - steps/tandem/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b exp/tandem2uc-tri4b_ali || exit 1; - steps/tandem/make_denlats.sh --nj 40 --cmd "$decode_cmd" --transform-dir exp/tandem2uc-tri4b_ali \ - --sub-split 100 data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b exp/tandem2uc-tri4b_denlats || exit 1; -fi -if [ $stage -le 12 ]; then - steps/tandem/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --acwt 0.039 \ - data/train_nodup $bndata/train_nodup data/lang exp/tandem2uc-tri4b_{ali,denlats} exp/tandem2uc-tri4b_mmi_b0.1 || exit 1; - - steps/tandem/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode_tandem.config \ - --transform-dir exp/tandem2uc-tri4b/decode_eval2000 \ - exp/tandem2uc-tri4b/graph data/eval2000 $bndata/eval2000 exp/tandem2uc-tri4b_mmi_b0.1/decode_eval2000 || exit 1; + if $has_fisher; then + # Rescore with the 4gram swbd+fisher language model. 
+ dir=exp/tri7uc-sat_mmi_b0.1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/eval2000 \ + $dir/decode_eval2000_graph_sw1_{tg,fsh_fg}_it4 + fi fi -echo success -exit 0 +echo Done. diff --git a/egs/swbd/s5c/local/nnet2/run_nnet2.sh b/egs/swbd/s5c/local/nnet2/run_nnet2.sh index 0872560337b..e83c587a006 100755 --- a/egs/swbd/s5c/local/nnet2/run_nnet2.sh +++ b/egs/swbd/s5c/local/nnet2/run_nnet2.sh @@ -5,7 +5,7 @@ # units, on top of fMLLR features, on GPU. temp_dir= -dir=exp/nnet2_5 +dir=nnet2_5 has_fisher=true . ./cmd.sh @@ -18,10 +18,10 @@ parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll ( if [ ! -f exp/$dir/final.mdl ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d exp/$dir/egs/storage ]; then # spread the egs over various machines. utils/create_split_dir.pl \ - /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/exp/$dir/egs/storage exp/$dir/egs/storage fi steps/nnet2/train_pnorm_accel2.sh --parallel-opts "$parallel_opts" \ diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh new file mode 100755 index 00000000000..32494afe47b --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -0,0 +1,203 @@ +#!/bin/bash + +set -o pipefail +set -e +# this is run_discriminative.sh + +# This script does discriminative training on top of CE BLSTM system. +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. +# +. cmd.sh + + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=-10 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, + # alignments and degs). + +frames_per_chunk=150 +# The contexts here must match the one used for training +extra_left_context=40 +extra_right_context=40 +extra_left_context_initial=-1 +extra_right_context_final=-1 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/nnet3/lstm_bidirectional_ld0 +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +degs_dir= # If provided, will skip the degs directory creation +lats_dir= # If provided, will skip denlats creation + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion} + +## Egs options +frames_per_eg=150 +frames_overlap_per_eg=30 +truncate_deriv_weights=10 + +## Nnet training options +effective_learning_rate=0.0000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=4 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +minibatch_size=64 +adjust_priors=true # May need to be set to false + # because it does not help in some setups +modify_learning_rates=true +last_layer_factor=0.1 + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! 
cuda-compiled; then + cat < $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; + + # Remove the small number of utterances that couldn't be extracted for some + # reason (e.g. too short; no such file). + utils/fix_data_dir.sh data/${dataset}_hires; + done + + for dataset in eval2000 train_dev rt03; do + # Create MFCCs for the eval set + utils/copy_data_dir.sh data/$dataset data/${dataset}_hires + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems + done + + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training + utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires + local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5500 90000 data/train_100k_nodup_hires \ + data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + # To train a diagonal UBM we don't need very much data, so use the smallest subset. + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + # iVector extractors can be sensitive to the amount of data, but this one has a + # fairly small dim (defaults to 100) so we don't use all of it, we use just the + # 100k subset (just under half the data). + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + # We extract iVectors on all the train_nodup data, which will be what we + # train the system on. + + # having a larger number of speakers is helpful for generalization, and to + # handle per-utterance decoding well (iVector starts at zero). 
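+  # (--utts-per-spk-max 2 splits each speaker into pseudo-speakers of at most
+  # two utterances, so the training iVectors are estimated from little data,
+  # similar to what happens at the start of per-utterance decoding)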
+ steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; + + for data_set in eval2000 train_dev rt03; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; + done +fi + +exit 0; diff --git a/egs/swbd/s5c/local/nnet3/run_lstm.sh b/egs/swbd/s5c/local/nnet3/run_lstm.sh new file mode 100755 index 00000000000..11fc851cb71 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_lstm.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# Apache 2.0. + + +# this is a basic lstm script +# LSTM script runs for more epochs than the TDNN script +# and each epoch takes twice the time + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call lstm/train.sh with --gpu false + +stage=0 +train_stage=-10 +has_fisher=true +affix= +speed_perturb=true +common_egs_dir= +reporting_email= + +# LSTM options +splice_indexes="-2,-1,0,1,2 0 0" +lstm_delay=" -1 -2 -3 " +label_delay=5 +num_lstm_layers=3 +cell_dim=1024 +hidden_dim=1024 +recurrent_projection_dim=256 +non_recurrent_projection_dim=256 +chunk_width=20 +chunk_left_context=40 +chunk_right_context=0 + + +# training options +num_epochs=8 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=15 +momentum=0.5 +num_chunk_per_minibatch=100 +samples_per_iter=20000 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <" + echo "See comments in the script for more details" + exit 1 +fi + +sdir=$1 +[ ! -d $sdir/data/audio/eval03/english/cts ] \ + && echo Expecting directory $sdir/data/audio/eval03/english/cts to be present && exit 1; +[ ! -d $sdir/data/references/eval03/english/cts ] \ + && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; + +. path.sh + +dir=data/local/rt03 +mkdir -p $dir + +rtroot=$sdir +tdir=$sdir/data/references/eval03/english/cts +sdir=$sdir/data/audio/eval03/english/cts + +find $sdir -iname '*.sph' | sort > $dir/sph.flist +sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \ + > $dir/sph.scp + +sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +[ ! -x $sph2pipe ] \ + && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; + +awk -v sph2pipe=$sph2pipe '{ + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); +}' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; +#side A - channel 1, side B - channel 2 + +# Get segments file... +# segments file format is: utt-id side-id start-time end-time, e.g.: +# sw02001-A_000098-001156 sw02001-A 0.98 11.56 +#pem=$sdir/english/hub5e_00.pem +#[ ! 
-f $pem ] && echo "No such file $pem" && exit 1; +# pem file has lines like: +# en_4156 A unknown_speaker 301.85 302.48 + +#grep -v ';;' $pem \ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ + | awk '{ + spk=$1"-"(($2==1)?"A":"B"); + utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); + print utt,spk,$4,$5;}' \ + | sort -u > $dir/segments + +# stm file has lines like: +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# TODO(arnab): We should really be lowercasing this since the Edinburgh +# recipe uses lowercase. This is not used in the actual scoring. +#grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ +cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ + | awk '{ + spk=$1"-"(($2==1)?"A":"B"); + utt=sprintf("%s_%06d-%06d",spk,$4*100,$5*100); + printf utt; for(n=7;n<=NF;n++) printf(" %s", $n); print ""; }' \ + | sort > $dir/text.all + +# We'll use the stm file for sclite scoring. There seem to be various errors +# in the stm file that upset hubscr.pl, and we fix them here. +cat $tdir/*.stm | \ + sed -e 's:((:(:' -e 's:::g' -e 's:::g' | \ + grep -v inter_segment_gap | \ + awk '{ + printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ + > $dir/stm +#$tdir/reference/hub5e00.english.000405.stm > $dir/stm +cp $rtroot/data/trans_rules/en20030506.glm $dir/glm + +# next line uses command substitution +# Just checking that the segments are the same in pem vs. stm. +! cmp <(awk '{print $1}' $dir/text.all) <(awk '{print $1}' $dir/segments) && \ + echo "Segments from pem file and stm file do not match." && exit 1; + +grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text + +# create an utt2spk file that assumes each conversation side is +# a separate speaker. +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# cp $dir/segments $dir/segments.tmp +# awk '{x=$3-0.05; if (x<0.0) x=0.0; y=$4+0.05; print $1, $2, x, y; }' \ +# $dir/segments.tmp > $dir/segments + +awk '{print $1}' $dir/wav.scp \ + | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; + print "$1-$2 $1 $2\n"; ' \ + > $dir/reco2file_and_channel || exit 1; + +dest=data/rt03 +mkdir -p $dest +for x in wav.scp segments text utt2spk spk2utt stm glm reco2file_and_channel; do + cp $dir/$x $dest/$x +done + +echo Data preparation and formatting completed for RT-03 +echo "(but not MFCC extraction)" + diff --git a/egs/swbd/s5c/local/score.sh b/egs/swbd/s5c/local/score.sh index 81455d1e13a..40a49d0b41a 100755 --- a/egs/swbd/s5c/local/score.sh +++ b/egs/swbd/s5c/local/score.sh @@ -13,6 +13,7 @@ stage=0 min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. diff --git a/egs/swbd/s5c/local/score_basic.sh b/egs/swbd/s5c/local/score_basic.sh index aaaf005ceba..8fed1b3bab7 100755 --- a/egs/swbd/s5c/local/score_basic.sh +++ b/egs/swbd/s5c/local/score_basic.sh @@ -6,6 +6,7 @@ cmd=run.pl min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. @@ -26,9 +27,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! 
-f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -42,10 +43,10 @@ mkdir -p $dir/scoring/log function filter_text { - perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } + perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; } while() { @A = split(" ", $_); $id = shift @A; print "$id "; foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \ - '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '' '%HESITATION' + '[noise]' '[laughter]' '[vocalized-noise]' '' '%hesitation' } for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do diff --git a/egs/swbd/s5c/local/score_sclite.sh b/egs/swbd/s5c/local/score_sclite.sh index 847e7625015..7ac33fdd26a 100755 --- a/egs/swbd/s5c/local/score_sclite.sh +++ b/egs/swbd/s5c/local/score_sclite.sh @@ -7,6 +7,7 @@ stage=0 min_lmwt=5 max_lmwt=20 reverse=false +iter=final word_ins_penalty=0.0,0.5,1.0 #end configuration section. @@ -28,9 +29,9 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -43,32 +44,39 @@ name=`basename $data`; # e.g. eval2000 mkdir -p $dir/scoring/log +align_word= +reorder_opt= +if $reverse; then + align_word="lattice-reverse ark:- ark:- |" + reorder_opt="--reorder=false" +fi + + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +name=`basename $data`; # e.g. 
eval2000 + +mkdir -p $dir/scoring/log + if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - if $reverse; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-reverse ark:- ark:- \| \ - lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - fi + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $reorder_opt $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; done fi @@ -93,7 +101,7 @@ if [ $stage -le 1 ]; then fi # Score the set... -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ @@ -102,23 +110,45 @@ if [ $stage -le 2 ]; then fi # For eval2000 score the subsets -case "$name" in eval2000* ) - # Score only the, swbd part... - if [ $stage -le 3 ]; then +case "$name" in + eval2000*) + # Score only the, swbd part... + if [ $stage -le 3 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \ + grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; + done + fi + # Score only the, callhome part... + if [ $stage -le 3 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1; + done + fi + ;; +rt03* ) + + # Score only the swbd part... 
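+  # (rt03 recording ids start with 'fsh_' for the Fisher conversations and
+  # 'sw_' for the Switchboard ones, so removing '^fsh_' lines keeps the swbd
+  # subset and removing '^sw_' lines keeps the fisher subset)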
+ if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \ - grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ - grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ + grep -v '^fsh_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ + grep -v '^fsh_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.swbd $dir/score_LMWT_${wip}/${name}.ctm.swbd || exit 1; done fi - # Score only the, callhome part... - if [ $stage -le 3 ]; then + # Score only the fisher part... + if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \ - grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \ - grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \ - $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.callhm $dir/score_LMWT_${wip}/${name}.ctm.callhm || exit 1; + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.fsh.LMWT.${wip}.log \ + grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.fsh '&&' \ + grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.fsh '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm.fsh $dir/score_LMWT_${wip}/${name}.ctm.fsh || exit 1; done fi ;; diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index 00ec97c5028..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,24 +23,19 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( cd $dir; if [ ! 
-d swb_ms98_transcriptions ]; then echo " *** Downloading trascriptions and dictionary ***" + wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz tar -xf switchboard_word_alignments.tar.gz fi diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/swbd/s5c/path.sh b/egs/swbd/s5c/path.sh index db666cc10f6..1bea0e69779 100755 --- a/egs/swbd/s5c/path.sh +++ b/egs/swbd/s5c/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH -#$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$KALDI_ROOT/tools/srilm/bin/i686:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + export LC_ALL=C diff --git a/egs/swbd/s5c/run.sh b/egs/swbd/s5c/run.sh index afe561f881f..3bc2df0a337 100755 --- a/egs/swbd/s5c/run.sh +++ b/egs/swbd/s5c/run.sh @@ -7,11 +7,13 @@ # 1. added more training data for early stages # 2. removed SAT system (and later stages) on the 100k utterance training data # 3. reduced number of LM rescoring, only sw1_tg and sw1_fsh_fg remain -# 4. mapped swbd transcription to fisher style, instead of the other way around +# 4. 
mapped swbd transcription to fisher style, instead of the other way around set -e # exit on error has_fisher=true local/swbd1_data_download.sh /export/corpora3/LDC/LDC97S62 +# local/swbd1_data_download.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # BUT, + # prepare SWBD dictionary first since we want to find acronyms according to pronunciations # before mapping lexicon and transcripts local/swbd1_prepare_dict.sh @@ -20,7 +22,7 @@ local/swbd1_prepare_dict.sh # which specifies the directory to Switchboard documentations. Specifically, if # this argument is given, the script will look for the conv.tab file and correct # speaker IDs to the actual speaker personal identification numbers released in -# the documentations. The documentations can be found here: +# the documentations. The documentations can be found here: # https://catalog.ldc.upenn.edu/docs/LDC97S62/ # Note: if you are using this link, make sure you rename conv_tab.csv to conv.tab # after downloading. @@ -28,24 +30,22 @@ local/swbd1_prepare_dict.sh local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62 # local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62 # local/swbd1_data_prep.sh /data/corpora0/LDC97S62 -# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 +# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2 # BUT, # local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1 utils/prepare_lang.sh data/local/dict_nosp \ "" data/local/lang_nosp data/lang_nosp # Now train the language models. We are using SRILM and interpolating with an -# LM trained on the Fisher transcripts (part 2 disk is currently missing; so +# LM trained on the Fisher transcripts (part 2 disk is currently missing; so # only part 1 transcripts ~700hr are used) # If you have the Fisher data, you can set this "fisher_dir" variable. fisher_dirs="/export/corpora3/LDC/LDC2004T19/fe_03_p1_tran/ /export/corpora3/LDC/LDC2005T19/fe_03_p2_tran/" -#fisher_dirs="/home/dpovey/data/LDC2004T19/fe_03_p1_tran/" -#fisher_dirs="/data/corpora0/LDC2004T19/fe_03_p1_tran/" -# edinburgh: -# fisher_dirs="/exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts" -# brno: -# fisher_dirs="/mnt/matylda2/data/FISHER/fe_03_p1_tran" # BUT +# fisher_dirs="/home/dpovey/data/LDC2004T19/fe_03_p1_tran/" +# fisher_dirs="/data/corpora0/LDC2004T19/fe_03_p1_tran/" +# fisher_dirs="/exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts" # Edinburgh, +# fisher_dirs="/mnt/matylda2/data/FISHER/fe_03_p1_tran /mnt/matylda2/data/FISHER/fe_03_p2_tran" # BUT, local/swbd1_train_lms.sh data/local/train/text \ data/local/dict_nosp/lexicon.txt data/local/lm $fisher_dirs @@ -79,7 +79,7 @@ mfccdir=mfcc for x in train eval2000; do steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \ data/$x exp/make_mfcc/$x $mfccdir - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x done @@ -91,11 +91,10 @@ utils/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min n=$[`cat data/train/segments | wc -l` - 4000] utils/subset_data_dir.sh --last data/train $n data/train_nodev -# Now-- there are 260k utterances (313hr 23min), and we want to start the -# monophone training on relatively short utterances (easier to align), but not -# only the shortest ones (mostly uh-huh). 
So take the 100k shortest ones; -# remove most of the repeated utterances (these are the uh-huh type ones), and -# then take 10k random utterances from those (about 4hr 40mins) +# Now-- there are 260k utterances (313hr 23min), and we want to start the +# monophone training on relatively short utterances (easier to align), but not +# only the shortest ones (mostly uh-huh). So take the 100k shortest ones, and +# then take 30k random utterances from those (about 12hr) utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 30000 data/train_30kshort @@ -108,13 +107,13 @@ local/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup # 110hr local/remove_dup_utts.sh 300 data/train_nodev data/train_nodup # 286hr ## Starting basic training on MFCC features steps/train_mono.sh --nj 30 --cmd "$train_cmd" \ - data/train_30kshort data/lang_nosp exp/mono + data/train_30kshort data/lang_nosp exp/mono steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali + data/train_100k_nodup data/lang_nosp exp/mono exp/mono_ali steps/train_deltas.sh --cmd "$train_cmd" \ - 3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1 + 3200 30000 data/train_100k_nodup data/lang_nosp exp/mono_ali exp/tri1 ( graph_dir=exp/tri1/graph_nosp_sw1_tg @@ -125,7 +124,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ ) & steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali + data/train_100k_nodup data/lang_nosp exp/tri1 exp/tri1_ali steps/train_deltas.sh --cmd "$train_cmd" \ 4000 70000 data/train_100k_nodup data/lang_nosp exp/tri1_ali exp/tri2 @@ -149,11 +148,11 @@ steps/align_si.sh --nj 30 --cmd "$train_cmd" \ # From now, we start using all of the data (except some duplicates of common # utterances, which don't really contribute much). steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup + data/train_nodup data/lang_nosp exp/tri2 exp/tri2_ali_nodup # Do another iteration of LDA+MLLT training, on all the data. steps/train_lda_mllt.sh --cmd "$train_cmd" \ - 6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3 + 6000 140000 data/train_nodup data/lang_nosp exp/tri2_ali_nodup exp/tri3 ( graph_dir=exp/tri3/graph_nosp_sw1_tg @@ -190,7 +189,7 @@ fi # Train tri4, which is LDA+MLLT+SAT, on all the (nodup) data. steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup + data/train_nodup data/lang exp/tri3 exp/tri3_ali_nodup steps/train_sat.sh --cmd "$train_cmd" \ @@ -203,6 +202,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \ --config conf/decode.config \ $graph_dir data/eval2000 exp/tri4/decode_eval2000_sw1_tg + # Will be used for confidence calibration example, + steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" \ + $graph_dir data/train_dev exp/tri4/decode_dev_sw1_tg ) & wait @@ -212,13 +214,13 @@ if $has_fisher; then exp/tri4/decode_eval2000_sw1_{tg,fsh_fg} fi -# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data. +# MMI training starting from the LDA+MLLT+SAT systems on all the (nodup) data. 
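# (A brief, hedged aside, not part of the original recipe comments: with boosted MMI, each
# denominator-lattice path s is additionally weighted by exp(-b * A(s, s_ref)), where A is the
# per-frame phone accuracy against the reference s_ref, so paths with more errors get relatively
# boosted in the denominator; the --boost 0.1 option passed to steps/train_mmi.sh below sets b.)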
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \ data/train_nodup data/lang exp/tri4 exp/tri4_ali_nodup steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" \ --config conf/decode.config --transform-dir exp/tri4_ali_nodup \ - data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup + data/train_nodup data/lang exp/tri4 exp/tri4_denlats_nodup # 4 iterations of MMI seems to work well overall. The number of iterations is # used as an explicit argument even though train_mmi.sh will use 4 iterations by @@ -226,7 +228,7 @@ steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" \ num_mmi_iters=4 steps/train_mmi.sh --cmd "$decode_cmd" \ --boost 0.1 --num-iters $num_mmi_iters \ - data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1 + data/train_nodup data/lang exp/tri4_{ali,denlats}_nodup exp/tri4_mmi_b0.1 for iter in 1 2 3 4; do ( @@ -245,7 +247,7 @@ if $has_fisher; then ( steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/eval2000 \ - exp/tri4_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_fsh_{tg,fsh_fg} + exp/tri4_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_{tg,fsh_fg} ) & done fi @@ -257,7 +259,7 @@ steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \ steps/train_mmi_fmmi.sh --learning-rate 0.005 \ --boost 0.1 --cmd "$train_cmd" \ data/train_nodup data/lang exp/tri4_ali_nodup exp/tri4_dubm \ - exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1 + exp/tri4_denlats_nodup exp/tri4_fmmi_b0.1 for iter in 4 5 6 7 8; do ( @@ -301,5 +303,18 @@ fi # demonstration script for raw-fMLLR. You should probably ignore this. # local/run_raw_fmllr.sh +# nnet3 LSTM recipe +# local/nnet3/run_lstm.sh + +# nnet3 BLSTM recipe +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 1024 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 + # getting results (see RESULTS file) # for x in 1 2 3a 3b 4a; do grep 'Percent Total Error' exp/tri$x/decode_eval2000_sw1_tg/score_*/eval2000.ctm.filt.dtl | sort -k5 -g | head -1; done diff --git a/egs/swbd/s5c/swbd.perf b/egs/swbd/s5c/swbd.perf new file mode 100644 index 00000000000..5151a6fdaa0 --- /dev/null +++ b/egs/swbd/s5c/swbd.perf @@ -0,0 +1,33 @@ +%WER 12.8 | 1831 21395 | 89.2 7.7 3.2 2.0 12.8 50.4 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.5 | 1831 21395 | 89.8 6.7 3.4 1.4 11.5 47.0 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.5 | 1831 21395 | 89.6 6.6 3.8 1.1 11.5 47.1 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.4 | 1831 21395 | 89.9 6.4 3.7 1.3 11.4 46.9 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.4 | 1831 21395 | 89.8 6.6 3.7 1.2 11.4 47.5 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_12_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.3 | 1831 21395 | 90.0 6.6 3.4 1.3 11.3 46.0 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.2 | 1831 21395 | 90.1 6.4 3.5 1.3 11.2 46.0 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.2 6.3 3.5 1.3 11.1 46.6 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_12_0.0/eval2000_hires.ctm.swbd.filt.sys 
+%WER 11.1 | 1831 21395 | 90.1 6.6 3.3 1.3 11.1 46.6 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.5 3.5 1.2 11.1 45.8 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.5 3.4 1.3 11.1 45.7 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.4 3.5 1.1 11.1 46.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.1 6.3 3.6 1.2 11.1 46.4 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.0 6.4 3.7 1.1 11.1 46.3 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 90.0 6.4 3.7 1.1 11.1 46.3 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 11.1 | 1831 21395 | 89.9 6.3 3.8 1.1 11.1 46.1 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_11_1.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.3 6.4 3.3 1.3 11.0 47.1 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.4 3.3 1.2 11.0 45.8 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.3 3.5 1.2 11.0 46.5 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.3 3.5 1.2 11.0 46.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.2 3.6 1.2 11.0 45.9 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.2 6.2 3.5 1.2 11.0 46.3 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 11.0 | 1831 21395 | 90.1 6.3 3.5 1.2 11.0 46.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.4 6.2 3.4 1.3 10.9 46.6 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.3 6.3 3.4 1.2 10.9 45.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 10.9 | 1831 21395 | 90.3 6.3 3.4 1.2 10.9 45.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.swbd.filt.sys +%WER 10.8 | 1831 21395 | 90.4 6.3 3.2 1.3 10.8 46.3 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.8 | 1831 21395 | 90.4 6.1 3.5 1.2 10.8 45.9 | 
exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.7 | 1831 21395 | 90.5 6.2 3.3 1.2 10.7 45.1 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +%WER 10.6 | 1831 21395 | 90.6 6.2 3.1 1.3 10.6 46.1 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys diff --git a/egs/swbd/s5c/tdnn_chain.sh b/egs/swbd/s5c/tdnn_chain.sh new file mode 100755 index 00000000000..8b0362f4e90 --- /dev/null +++ b/egs/swbd/s5c/tdnn_chain.sh @@ -0,0 +1,334 @@ +exp=$1 + +if [ $exp -eq 1 ]; then +dir_name=exp/chain/tdnn_v1_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_2o_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial1 \ + --stage 12 \ + --train-stage -5 \ + --common-egs-dir exp/chain/tdnn_2o_sp/egs +fi + +if [ $exp -eq 2 ]; then +# had to reduce the batch size as there were memory issues +# models up to iteration 216 cannot be read anymore as the Read.WRite methods changed +# there are more issues, I am just restarting the experiment +dir_name=exp/chain/tdnn_v1_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_2o_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial2 \ + --stage 12 \ + --train-stage 216 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 +fi + + +if [ $exp -eq 3 ]; then + # same as trial1 but with smaller mini-batch size to be used as a control for trial2 +dir_name=exp/chain/tdnn_v1_trial3_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial3 \ + --stage 12 \ + --train-stage 298 \ + --minibatch-size 64 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 4 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v1_trial4_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial4 \ + --stage 12 \ + --train-stage 469 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 5 ]; then + # this is trial2 just restarted +dir_name=exp/chain/tdnn_v1_trial5_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial5 \ + --stage 12 \ + --train-stage 182 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 6 ]; then + # same as trial2 but with per-dim affine component +dir_name=exp/chain/tdnn_v1_trial6_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh 
--affix trial6 \ + --stage 12 \ + --train-stage 323 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 7 ]; then + # same as trial2 but with per-dim affine component +dir_name=exp/chain/tdnn_v1_trial7_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v1.sh --affix trial7 \ + --stage 12 \ + --train-stage -5 \ + --splice-indexes "-2,-1,0,1,2 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0,1 -1,0 -1,0 -1,0 -1,0 -1,0" \ + --relu-dim 450 \ + --minibatch-size 64 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 8 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v1_trial8_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial8 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 9 ]; then + # same as trial2 but with updatable convolution layers +dir_name=exp/chain/tdnn_v2_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial1 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 10 ]; then +dir_name=exp/chain/tdnn_v2_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial2 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 11 ]; then +dir_name=exp/chain/tdnn_v2_trial3_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial3 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'weighted-average' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 12 ]; then +dir_name=exp/chain/tdnn_v2_trial4_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial4 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'low-pass' \ + --pool-lpfilter-width "0.333" \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 13 ]; then +dir_name=exp/chain/tdnn_v2_trial5_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts 
frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v2.sh --affix trial5 \ + --stage 12 \ + --train-stage -5 \ + --relu-dim 500 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 14 ]; then +dir_name=exp/chain/tdnn_v3_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v3.sh --affix trial1 \ + --stage 12 \ + --train-stage -1 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 15 ]; then +dir_name=exp/chain/tdnn_v4_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v4.sh --affix trial1 \ + --stage 12 \ + --train-stage 116 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + +if [ $exp -eq 16 ]; then +dir_name=exp/chain/tdnn_v4_trial2_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v4.sh --affix trial2 \ + --stage 12 \ + --train-stage -5 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --self-repair-scale "" \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 17 ]; then + # this is very similar to v3_trial1 as expected, so discontinuing this was + # similar to v4, except for HMM leaky coefficient reducing hmm leaky + # coefficient to 1e-5, brings the training progress back to before which + # causes a lot of undertraining + +dir_name=exp/chain/tdnn_v5_trial1_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix trial1 \ + --stage 12 \ + --train-stage -15 \ + --minibatch-size 64 \ + --pool-type 'per-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 18 ]; then +dir_name=exp/chain/tdnn_v5_mdwa_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix mdwa \ + --stage 12 \ + --train-stage 0 \ + --minibatch-size 64 \ + --pool-type 'multi-dim-weighted-average' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi + + +if [ $exp -eq 19 ]; then +dir_name=exp/chain/tdnn_v5_mdwa_sp/ +mkdir -p $dir_name +for f in 0.trans_mdl cmvn_opts frame_subsampling_factor normalization.fst phone_lm.fst den.fst tree; do + cp exp/chain/tdnn_v1_trial1_sp/$f $dir_name +done + + local/chain/tdnn/run_tdnn_v5.sh --affix mdwa \ + --stage 12 \ + --train-stage -15 \ + --minibatch-size 64 \ + --pool-type 'none' \ + --pool-window 7 \ + --common-egs-dir exp/chain/tdnn_v1_trial2_sp/egs +fi diff --git a/egs/swbd/s5c/total.perf 
b/egs/swbd/s5c/total.perf new file mode 100644 index 00000000000..112285c817a --- /dev/null +++ b/egs/swbd/s5c/total.perf @@ -0,0 +1,33 @@ +%WER 19.3 | 4459 42989 | 83.5 11.9 4.7 2.8 19.3 57.8 | exp/chain/tdnn_v_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.9 10.2 4.9 2.0 17.1 53.9 | exp/chain/tdnn_v1_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.9 10.1 4.9 2.0 17.1 54.0 | exp/chain/tdnn_v1_trial5_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 17.1 | 4459 42989 | 84.7 10.0 5.3 1.9 17.1 54.1 | exp/chain/tdnn_v1_trial4_sp/decode_eval2000_sw1_fsh_fg/score_12_0.5/eval2000_hires.ctm.filt.sys +%WER 17.0 | 4459 42989 | 85.0 9.9 5.1 1.9 17.0 53.8 | exp/chain/tdnn_v1_trial3_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 16.9 | 4459 42989 | 84.8 9.4 5.8 1.6 16.9 53.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.8 | 4459 42989 | 84.9 9.4 5.7 1.7 16.8 53.9 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.8 5.0 1.9 16.7 52.9 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.6 5.2 1.9 16.7 53.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.2 9.4 5.4 1.9 16.7 54.0 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.1 9.9 5.0 1.9 16.7 53.5 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.7 | 4459 42989 | 85.1 9.7 5.2 1.8 16.7 53.6 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.4 10.0 4.7 2.0 16.6 53.7 | exp/chain/tdnn_v2_trial5_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.3 9.8 4.9 1.9 16.6 53.1 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.2 1.8 16.6 53.7 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.3 | exp/chain/tdnn_v2_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.3 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.6 5.2 1.8 16.6 53.3 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.2 9.5 5.3 1.8 16.6 53.4 | exp/chain/tdnn_v1_trial6_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.1 9.2 5.7 1.7 16.6 53.0 | 
exp/chain/tdnn_v2_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.6 | 4459 42989 | 85.1 9.2 5.7 1.7 16.6 53.0 | exp/chain/tdnn_v1_trial8_sp/decode_eval2000_200_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.3 9.3 5.4 1.8 16.5 53.8 | exp/chain/tdnn_v2_trial3_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.2 9.5 5.3 1.7 16.5 53.6 | exp/chain/tdnn_v2_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.5 | 4459 42989 | 85.2 9.5 5.3 1.7 16.5 53.3 | exp/chain/tdnn_v2_trial4_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.5 9.6 5.0 1.9 16.4 53.3 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_400_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.4 9.6 5.0 1.8 16.4 53.7 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.4 | 4459 42989 | 85.4 9.5 5.1 1.8 16.4 53.3 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.3 | 4459 42989 | 85.6 9.7 4.7 1.9 16.3 53.2 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_300_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +%WER 16.3 | 4459 42989 | 85.4 9.4 5.2 1.8 16.3 53.8 | exp/chain/tdnn_v2_trial2_sp/decode_eval2000_400_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys +%WER 16.1 | 4459 42989 | 85.7 9.5 4.7 1.9 16.1 52.7 | exp/chain/tdnn_v3_trial1_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys diff --git a/egs/tedlium/s5/RESULTS b/egs/tedlium/s5/RESULTS index 9c494712aa8..0c209bddf7e 100644 --- a/egs/tedlium/s5/RESULTS +++ b/egs/tedlium/s5/RESULTS @@ -7,6 +7,27 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; d for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp exit 0 + +#---------------------------------Current results (after fixing the problem)--------------------------------- +# There was a problem with the language model preparation where the scripts expected <UNK> to represent OOV words while +# the language model used <unk> to represent them. See `git log tedlium-unk-fix` for details. +# Fixing this causes a small decrease in WER.
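# (A hedged pointer, for reference: the fix makes the scripts use the Cantab LM's own OOV symbol,
# e.g. run.sh now calls
#   utils/prepare_lang.sh data/local/dict_nosp "<unk>" data/local/lang_nosp data/lang_nosp
# and local/prepare_dict.sh adds the matching '<unk> NSN' lexicon entry; see those files later in
# this patch.)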
+ +# GMMs +# DEV SPEAKERS: +%WER 31.0 | 507 17792 | 73.5 20.2 6.3 4.5 31.0 97.2 | -0.032 | exp/tri1/decode_nosp_dev/score_11_0.0/ctm.filt.filt.sys +%WER 26.4 | 507 17792 | 77.8 16.7 5.5 4.2 26.4 95.5 | -0.066 | exp/tri2/decode_nosp_dev/score_13_0.0/ctm.filt.filt.sys +%WER 26.1 | 507 17792 | 77.2 16.3 6.5 3.4 26.1 95.5 | -0.106 | exp/tri2/decode_dev/score_14_1.0/ctm.filt.filt.sys +%WER 22.0 | 507 17792 | 81.6 13.2 5.2 3.6 22.0 93.9 | -0.189 | exp/tri3/decode_dev/score_13_1.0/ctm.filt.filt.sys + +# TEST SPEAKERS: +%WER 30.9 | 1155 27512 | 72.1 21.0 6.9 3.0 30.9 94.5 | 0.035 | exp/tri1/decode_nosp_test/score_12_0.5/ctm.filt.filt.sys +%WER 25.5 | 1155 27512 | 78.0 17.4 4.6 3.6 25.5 92.8 | -0.034 | exp/tri2/decode_nosp_test/score_12_0.0/ctm.filt.filt.sys +%WER 24.9 | 1155 27512 | 78.3 16.7 5.0 3.2 24.9 93.0 | -0.020 | exp/tri2/decode_test/score_14_0.5/ctm.filt.filt.sys +%WER 20.3 | 1155 27512 | 82.7 13.4 3.9 3.0 20.3 90.0 | -0.063 | exp/tri3/decode_test/score_14_0.5/ctm.filt.filt.sys + +#---------------------------------(Pre- fix for Cantab LM) Provided for reference---------------------------------- + # Results from Nikolay, using kaldi scoring: # %WER 35.17 [ 9677 / 27512, 1267 ins, 1681 del, 6729 sub ] exp/tri1/decode/wer_13 # %WER 30.03 [ 8262 / 27512, 1255 ins, 1367 del, 5640 sub ] exp/tri2/decode/wer_15 diff --git a/egs/tedlium/s5/cmd.sh b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/tedlium/s5/local/chain/README b/egs/tedlium/s5/local/chain/README new file mode 100644 index 00000000000..85e5b863a7c --- /dev/null +++ b/egs/tedlium/s5/local/chain/README @@ -0,0 +1,16 @@ +These are the instructions to reproduce the TEDLIUM models described in +"Purely sequence-trained neural networks for ASR based on lattice-free +MMI", by Povey et al. + +First run: + +./run.sh + +until the end of stage 7. (local/nnet/run_dnn.sh can be skipped.) + +Then run: + +local/chain/run_tdnn.sh + +to see results for a generic chain model. See the script's header +comments to see other options, and their results. \ No newline at end of file diff --git a/egs/tedlium/s5/local/chain/run_tdnn.sh b/egs/tedlium/s5/local/chain/run_tdnn.sh new file mode 100755 index 00000000000..804bf93f58a --- /dev/null +++ b/egs/tedlium/s5/local/chain/run_tdnn.sh @@ -0,0 +1,204 @@ +#!/bin/bash +# +# This script requires that you have run the toplevel run.sh script in TEDLIUM up to stage 7. 
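# A hedged usage sketch (not from the original header): the variables defined below are exposed as
# command-line flags by utils/parse_options.sh, so typical invocations could look like
#   local/chain/run_tdnn.sh
#   local/chain/run_tdnn.sh --stage 12 --affix _retune   # re-run only the neural-net training
# where the "_retune" affix is just an illustrative value.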
+# +# Results: (Run for x in exp/chain/tdnn/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null) +## Number of parameters: 6172530 +## %WER 14.1 | 507 17792 | 88.6 7.3 4.1 2.7 14.1 92.9 | 0.075 | exp/chain/tdnn/decode_dev/score_10_0.5/ctm.filt.filt.sys +## %WER 13.3 | 507 17792 | 89.7 6.9 3.4 2.9 13.3 92.1 | 0.000 | exp/chain/tdnn/decode_dev_rescore/score_10_0.0/ctm.filt.filt.sys +## %WER 13.8 | 1155 27512 | 89.4 7.5 3.1 3.2 13.8 87.9 | 0.101 | exp/chain/tdnn/decode_test/score_10_0.0/ctm.filt.filt.sys +## %WER 12.9 | 1155 27512 | 90.1 6.6 3.3 2.9 12.9 86.1 | 0.043 | exp/chain/tdnn/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys +# The final WER (rescored WER on the test set) is what we are interested in. + +# To reproduce the setup used in the paper, set the following variables: +# affix=_more_ce +# relu_dim=525 +# xent_regularize=0.2 +# +# Results: (Run for x in exp/chain/tdnn_more_ce/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null) +## Number of parameters: 8758742 +## %WER 14.3 | 507 17792 | 89.0 7.8 3.2 3.3 14.3 93.5 | 0.116 | exp/chain/tdnn_more_ce/decode_dev/score_10_0.0/ctm.filt.filt.sys +## %WER 13.0 | 507 17792 | 90.0 6.9 3.2 2.9 13.0 91.3 | -0.003 | exp/chain/tdnn_more_ce/decode_devv_rescore/score_10_0.0/ctm.filt.filt.sys +## %WER 13.8 | 1155 27512 | 89.1 7.4 3.4 2.9 13.8 87.5 | 0.082 | exp/chain/tdnn_more_ce/decode_test/score_10_0.5/ctm.filt.filt.sys +## %WER 12.8 | 1155 27512 | 90.4 6.6 3.1 3.1 12.8 86.7 | 0.014 | exp/chain/tdnn_more_ce/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +set -uo pipefail + +# configs for 'chain' +affix= +stage=0 # After running the entire script once, you can set stage=12 to tune the neural net only. +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn +decode_iter= + +# TDNN options +# this script uses the new tdnn config generator so it needs a final 0 to reflect that the final layer input has no splicing +self_repair_scale=0.00001 +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=8 +minibatch_size=128 +relu_dim=425 +frames_per_eg=150 +remove_egs=false +xent_regularize=0.1 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +dir=${dir}${affix} + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 4000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs"; + + # create the config files for nnet initialization + repair_opts=${self_repair_scale:+" --self-repair-scale $self_repair_scale "} + + steps/nnet3/tdnn/make_configs.py \ + $repair_opts \ + --feat-dir data/train_sp_hires \ + --ivector-dir exp/nnet3/ivectors_train_sp \ + --tree-dir $treedir \ + --relu-dim $relu_dim \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ + --use-presoftmax-prior-scale false \ + --xent-regularize $xent_regularize \ + --xent-separate-forward-affine true \ + --include-log-softmax false \ + --final-layer-normalize-target $final_layer_normalize_target \ + $dir/configs || exit 1; +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + # spread the egs over various machines. will help reduce overload of any + # one machine. + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_train_sp \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 20 \ + --feat-dir data/train_sp_hires \ + --tree-dir $treedir \ + --lat-dir $lats_dir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + + for decode_set in dev test; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $(wc -l data/$decode_set/spk2utt) --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + --scoring-opts "--min_lmwt 5 --max_lmwt 15" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter} || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test data/lang_rescore data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter} \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_rescore || exit 1; + ) & + done +fi + +wait \ No newline at end of file diff --git a/egs/tedlium/s5/local/confidence_calibration.sh b/egs/tedlium/s5/local/confidence_calibration.sh new file mode 100755 index 00000000000..0eb3dc21521 --- /dev/null +++ b/egs/tedlium/s5/local/confidence_calibration.sh @@ -0,0 +1,81 @@ +#!/bin/bash +. cmd.sh +. path.sh + +# Global options, +graph=exp/tri3/graph +arpa_gz=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz +lmwt=13 + +# Dev-set options, +dev_data=data/dev +dev_latdir=exp/tri3_mmi_b0.1/decode_dev_it4 + +# Eval-set options, +eval_data=data/test +eval_latdir=exp/tri3_mmi_b0.1/decode_test_it4 + +. utils/parse_options.sh +set -euxo pipefail + +# Derived options, +dev_caldir=$dev_latdir/confidence_$lmwt +eval_caldir=$eval_latdir/confidence_$lmwt + +###### Data preparation, + +# Prepare filtering for excluding data from train-set (1 .. keep word, 0 .. exclude word), +# - only excludes from training-targets, the confidences are recalibrated for all the words, +word_filter=$(mktemp) +awk '{ keep_the_word = $1 !~ /^(\[.*\]|<.*>|%.*|!.*|-.*|.*-)$/; print $0, keep_the_word }' \ + $graph/words.txt >$word_filter + +# Calcualte the word-length, +word_length=$(mktemp) +awk '{if(r==0) { len_hash[$1] = NF-2; } + if(r==1) { if(len_hash[$1]) { len = len_hash[$1]; } else { len = -1 } + print $0, len; }}' \ + r=0 $graph/phones/align_lexicon.txt \ + r=1 $graph/words.txt \ + >$word_length + +# Extract unigrams, +unigrams=$(mktemp); steps/conf/parse_arpa_unigrams.py $graph/words.txt $arpa_gz $unigrams + +###### Paste the 'word-specific' features (first 4 columns have fixed position, more feature-columns can be added), +# Format: "word word_id filter length other_features" +word_feats=$(mktemp) +paste $word_filter <(awk '{ print $3 }' $word_length) <(awk '{ print $3 }' $unigrams) > $word_feats + + +###### Train the calibration, +steps/conf/train_calibration.sh --cmd "$decode_cmd" --lmwt $lmwt \ + $dev_data $graph $word_feats $dev_latdir $dev_caldir + +###### Apply the calibration to eval set, +steps/conf/apply_calibration.sh --cmd "$decode_cmd" \ + $eval_data $graph $eval_latdir $dev_caldir $eval_caldir +# The final confidences are here '$eval_caldir/ctm_calibrated', + +###### Sclite scoring, +# We will produce NCE which shows the ``quality'' of the confidences. +# Please compare with the default scoring script for your database. 
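# For orientation, a hedged illustration (the utterance id, times and confidence below are made up):
# entries in $eval_caldir/ctm_calibrated follow the usual CTM layout
#   <utterance-id> <channel> <start-time> <duration> <word> <calibrated-confidence>
# e.g. "AlGore_2009-0001 1 12.34 0.31 climate 0.97", and can be sanity-checked with
#   head $eval_caldir/ctm_calibrated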
+ +# Scoring tools, +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubdir=`dirname $hubscr` + +# Inputs, +ctm=$eval_caldir/ctm_calibrated +stm=$eval_data/stm +glm=$eval_data/glm + +# Normalizng CTM, just like in 'local/score_sclite.sh', +cat $ctm | grep -v -E '\[BREATH|NOISE|COUGH|SMACK|UM|UH\]' | \ + grep -v -E '"!SIL|\' >${ctm}.filt + +# Mapping the time info to global, +utils/convert_ctm.pl $eval_data/segments $eval_data/reco2file_and_channel <${ctm}.filt >${ctm}.filt.conv + +# Scoring, +$hubscr -p $hubdir -V -l english -h hub5 -g $glm -r $stm ${ctm}.filt.conv diff --git a/egs/tedlium/s5/local/join_suffix.py b/egs/tedlium/s5/local/join_suffix.py index 25b097ed0e4..55cc9ba37ac 100755 --- a/egs/tedlium/s5/local/join_suffix.py +++ b/egs/tedlium/s5/local/join_suffix.py @@ -5,9 +5,10 @@ import sys +from codecs import open words = set() -for line in open(sys.argv[1]): +for line in open(sys.argv[1], encoding='utf8'): items = line.split() words.add(items[0]) @@ -16,12 +17,10 @@ new_items = [] i = 1 while i < len(items): - if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words: - new_items.append(items[i] + items[i+1]) - i = i + 1 - else: - new_items.append(items[i]) - i = i + 1 - - print items[0], " ".join(new_items) - + if i < len(items) - 1 and items[i+1][0] == '\'' and items[i] + items[i+1] in words: + new_items.append(items[i] + items[i+1]) + i = i + 1 + else: + new_items.append(items[i]) + i = i + 1 + print(items[0] + ' ' + ' '.join(new_items)) diff --git a/egs/tedlium/s5/local/nnet/run_dnn_bn.sh b/egs/tedlium/s5/local/nnet/run_dnn_bn.sh index 909d1b2f253..3bd0dc2a1ea 100755 --- a/egs/tedlium/s5/local/nnet/run_dnn_bn.sh +++ b/egs/tedlium/s5/local/nnet/run_dnn_bn.sh @@ -146,7 +146,7 @@ if [ $stage -le 5 ]; then --transform-dir $gmm/decode_$(basename $test_bn) \ $test_bn_fmllr $test_bn $gmm $test_bn_fmllr/log $test_bn_fmllr/data || exit 1; # Training set - steps/nnet/make_fmllr_feats.sh --nj $njfea --cmd "$train_cmd -tc 10" \ + steps/nnet/make_fmllr_feats.sh --nj $njfea --cmd "$train_cmd --max-jobs-run 10" \ --transform-dir ${gmm}_ali \ $train_bn_fmllr $train_bn $gmm $train_bn_fmllr/log $train_bn_fmllr/data || exit 1; # Split the training set diff --git a/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh b/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh index 6403db12f3e..762b8a71307 100755 --- a/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh +++ b/egs/tedlium/s5/local/nnet/run_dnn_fbank.sh @@ -40,7 +40,7 @@ stage=0 steps/compute_cmvn_stats.sh $test $test/log $test/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set diff --git a/egs/tedlium/s5/local/nnet/run_lstm.sh b/egs/tedlium/s5/local/nnet/run_lstm.sh index 3293724cfb3..a8d6326812e 100755 --- a/egs/tedlium/s5/local/nnet/run_lstm.sh +++ b/egs/tedlium/s5/local/nnet/run_lstm.sh @@ -29,7 +29,7 @@ stage=0 steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || 
exit 1; # Split the training set @@ -46,7 +46,7 @@ if [ $stage -le 1 ]; then steps/nnet/train.sh --network-type lstm --learn-rate 0.00001 \ --cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \ --proto-opts "--clip-gradient 5.0" \ - --train-opts "--momentum 0.9 --halving-factor 0.65" \ + --train-tool-opts "--momentum 0.9 --halving-factor 0.65" \ --train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \ ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; diff --git a/egs/tedlium/s5/local/nnet3/README b/egs/tedlium/s5/local/nnet3/README new file mode 100644 index 00000000000..6b77eb121b8 --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/README @@ -0,0 +1,9 @@ +To produce the results from: + +"Purely sequence-trained neural networks for ASR based on lattice-free MMI", Povey et al. + +Run the following in order: + +./run.sh +local/nnet3/run_tdnn.sh +local/nnet3/run_tdnn_discriminative.sh \ No newline at end of file diff --git a/egs/tedlium/s5/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..0b1738a2e8e --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# This is based on: +# swbd/s5c/local/nnet3/run_ivector_common.sh and +# tedlium/s5/local/online/run_nnet2_ms_perturbed.sh +# see the chain docs for general direction on what training is doing! + +set -uo pipefail +stage=1 +generate_alignments=true # false if doing ctc training + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mkdir -p exp/nnet3 +# perturb the data +train_set=train +if [ $stage -le 1 ]; then + #Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignment + + utils/perturb_data_dir_speed.sh 0.9 data/${train_set} data/temp1 + utils/perturb_data_dir_speed.sh 1.1 data/${train_set} data/temp2 + utils/combine_data.sh data/${train_set}_tmp data/temp1 data/temp2 + utils/validate_data_dir.sh --no-feats data/${train_set}_tmp + rm -r data/temp1 data/temp2 + + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${train_set}_tmp exp/make_mfcc/${train_set}_tmp $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_tmp exp/make_mfcc/${train_set}_tmp $mfccdir || exit1; + utils/fix_data_dir.sh data/${train_set}_tmp + + utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${train_set} data/temp0 + utils/combine_data.sh data/${train_set}_sp data/${train_set}_tmp data/temp0 + utils/fix_data_dir.sh data/${train_set}_sp + rm -r data/temp0 data/${train_set}_tmp +fi + +train_set_sp=${train_set}_sp + +if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then + # obtain the alignment of the pertubed data + steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ + data/${train_set_sp} data/lang_nosp exp/tri3 exp/tri3_ali_sp || exit 1 +fi + +if [ $stage -le 3 ]; then + + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b{09,10,11,12}/$USER/kaldi-data/egs/tedlium-$date/s5/$mfccdir/storage $mfccdir/storage + fi + + for dataset in $train_set $train_set_sp; do + data_dir=data/${dataset}_hires + utils/copy_data_dir.sh data/$dataset $data_dir + + # this next section does volume perturbation on the data. 
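  # For illustration (hypothetical utterance and gain): the loop below turns a wav.scp entry such as
  #   utt1 sph2pipe -f wav -p -c 1 db/TEDLIUM_release1/train/sph/utt1.sph |
  # into
  #   utt1 sph2pipe -f wav -p -c 1 db/TEDLIUM_release1/train/sph/utt1.sph | sox --vol 1.37 -t wav - -t wav - |
  # with the gain drawn uniformly from [1/8, 2.0].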
+ cat $data_dir/wav.scp | python -c " +import sys, os, subprocess, re, random +random.seed(0) +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) +"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; + mv $data_dir/wav.scp_scaled $data_dir/wav.scp + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + $data_dir exp/make_hires/$dataset $mfccdir + steps/compute_cmvn_stats.sh $data_dir exp/make_hires/$dataset $mfccdir + utils/fix_data_dir.sh $data_dir # remove segments with problems + done + + for dataset in dev test; do + data_dir=data/${dataset}_hires + utils/copy_data_dir.sh data/$dataset $data_dir + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + $data_dir exp/make_hires/$dataset $mfccdir + steps/compute_cmvn_stats.sh $data_dir exp/make_hires/$dataset $mfccdir + utils/fix_data_dir.sh $data_dir # remove segments with problems + done +fi + +# ivector extractor training +if [ $stage -le 5 ]; then + # We need to build a small system just because we need the LDA+MLLT transform + # to train the diag-UBM on top of. We use --num-iters 13 because after we get + # the transform (12th iter is the last), any further training is pointless. + # this decision is based on fisher_english + # Note: We do NOT use speed-perturbed data in this step. + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + --splice-opts "--left-context=3 --right-context=3" \ + 5000 10000 data/${train_set}_hires \ + data/lang_nosp exp/tri3_ali exp/nnet3/tri3b +fi + +if [ $stage -le 6 ]; then + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \ + data/${train_set_sp}_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm +fi + +if [ $stage -le 7 ]; then + steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ + data/${train_set_sp}_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1; +fi + +if [ $stage -le 8 ]; then + + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set_sp}_hires \ + data/${train_set_sp}_hires_max2 + + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ + data/${train_set_sp}_hires_max2 exp/nnet3/extractor exp/nnet3/ivectors_${train_set_sp} || exit 1 + + for data_set in dev test; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 8 \ + data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_${data_set} || exit 1; + done +fi diff --git a/egs/tedlium/s5/local/nnet3/run_tdnn.sh b/egs/tedlium/s5/local/nnet3/run_tdnn.sh new file mode 100755 index 00000000000..4eabd9fae0b --- /dev/null +++ b/egs/tedlium/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. 
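# A hedged note on "multi-splice" (not part of the original header): each TDNN layer splices its
# input over a small set of frame offsets. For example, with the chain-style setting used in
# local/chain/run_tdnn.sh above,
#   --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0"
# the per-layer left offsets sum to 1+1+3+3+3+6+0 = 17 and the right offsets to 1+2+3+3+3+0+0 = 12,
# so each output frame depends on input frames in roughly [t-17, t+12]; the indexes used by this
# nnet3 script itself may differ.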
+ +# Results (2 epochs): +# Number of parameters: 6056880 +# %WER 15.3 | 507 17792 | 87.4 9.0 3.6 2.7 15.3 90.1 | -0.081 | exp/nnet3/tdnn_sp/decode_dev/score_10_0.5/ctm.filt.filt.sys +# %WER 13.9 | 507 17792 | 88.4 8.0 3.6 2.3 13.9 85.8 | -0.164 | exp/nnet3/tdnn_sp/decode_dev_rescore/score_10_0.5/ctm.filt.filt.sys +# %WER 13.8 | 1155 27512 | 88.5 8.7 2.7 2.3 13.8 84.2 | -0.076 | exp/nnet3/tdnn_sp/decode_test/score_10_0.0/ctm.filt.filt.sys +# %WER 12.5 | 1155 27512 | 89.6 7.7 2.6 2.1 12.5 81.5 | -0.133 | exp/nnet3/tdnn_sp/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +# 4 epochs +# %WER 14.6 | 507 17792 | 87.9 8.7 3.4 2.5 14.6 88.6 | -0.111 | exp/nnet3/tdnn/decode_dev/score_10_0.5/ctm.filt.filt.sys +# %WER 13.2 | 507 17792 | 89.4 7.7 2.9 2.6 13.2 85.0 | -0.170 | exp/nnet3/tdnn/decode_dev_rescore/score_10_0.0/ctm.filt.filt.sys +# %WER 13.5 | 1155 27512 | 88.7 8.5 2.7 2.3 13.5 83.6 | -0.110 | exp/nnet3/tdnn/decode_test/score_10_0.0/ctm.filt.filt.sys +# %WER 12.1 | 1155 27512 | 89.9 7.5 2.6 2.1 12.1 80.3 | -0.178 | exp/nnet3/tdnn/decode_test_rescore/score_10_0.0/ctm.filt.filt.sys + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +stage=1 +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true +decode_iter= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <" | grep -v "" | LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt +cat $srcdict | grep -v -w "<s>" | grep -v -w "</s>" | grep -v -w "<unk>" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ - grep -v SIL | sort > $dir/nonsilence_phones.txt + grep -v SIL | sort > $dir/nonsilence_phones.txt ( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $dir/silence_phones.txt @@ -27,9 +29,11 @@ echo SIL > $dir/optional_silence.txt echo -n >$dir/extra_questions.txt # Add to the lexicon the silences, noises etc. +# Typically, you would use "<UNK> NSN" here, but the Cantab Research language models +# use <unk> instead of <UNK> to represent out of vocabulary words. (echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH'; echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH' - echo '<UNK> NSN' ) | \ + echo '<unk> NSN' ) | \ cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt # Check that the dict dir is okay! diff --git a/egs/tedlium/s5/local/prepare_lm.sh b/egs/tedlium/s5/local/prepare_lm.sh index 21e92704e23..e1efe628483 100755 --- a/egs/tedlium/s5/local/prepare_lm.sh +++ b/egs/tedlium/s5/local/prepare_lm.sh @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/bash # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # Apache 2.0 @@ -12,21 +12,8 @@ arpa_lm=db/cantab-TEDLIUM/cantab-TEDLIUM-pruned.lm3.gz rm -rf data/lang_nosp_test cp -r data/lang_nosp data/lang_nosp_test -# grep -v '<s> <s>' etc. is only for future-proofing this script. Our -# LM doesn't have these "invalid combinations". These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -# Note: remove_oovs.pl takes a list of words in the LM that aren't in -# our word list. Since our LM doesn't have any, we just give it -# /dev/null [we leave it in the script to show how you'd do it].
-gunzip -c "$arpa_lm" | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl /dev/null | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_nosp_test/words.txt \ - --osymbols=data/lang_nosp_test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_nosp_test/G.fst +gunzip -c "$arpa_lm" | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=data/lang_nosp_test/words.txt - data/lang_nosp_test/G.fst echo "Checking how stochastic G is (the first of these numbers should be small):" diff --git a/egs/tedlium/s5/local/score_sclite.sh b/egs/tedlium/s5/local/score_sclite.sh index 518ba040659..7b0915abea4 100755 --- a/egs/tedlium/s5/local/score_sclite.sh +++ b/egs/tedlium/s5/local/score_sclite.sh @@ -13,6 +13,7 @@ beam=7 # speed-up, but may affect MBR confidences. word_ins_penalty=0.0,0.5,1.0 min_lmwt=10 max_lmwt=20 +iter=final #end configuration section. [ -f ./path.sh ] && . ./path.sh @@ -32,7 +33,7 @@ data=$1 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. dir=$3 -model=$dir/../final.mdl # assume model one level up from decoding dir. +model=$dir/../$iter.mdl # assume model one level up from decoding dir. hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; @@ -48,6 +49,15 @@ nj=$(cat $dir/num_jobs) mkdir -p $dir/scoring/log +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ @@ -58,7 +68,7 @@ if [ $stage -le 0 ]; then lattice-prune --beam=$beam ark:- ark:- \| \ lattice-align-words-lexicon --output-error-lats=true --max-expand=10.0 --test=false \ $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; @@ -68,8 +78,10 @@ fi if [ $stage -le 1 ]; then # Remove some stuff we don't want to score, from the ctm. for x in $dir/score_*/ctm; do - cat $x | grep -v -E '"\[BREATH|NOISE|COUGH|SMACK|UM|UH\]"' | \ - grep -v -E '"!SIL|\"' > ${x}.filt || exit 1; + # `-i` is not needed in the following. It is added for robustness in ase this code is copy-pasted + # into another script that, e.g., uses instead of + cat $x | grep -v -w -i -E '\[BREATH|NOISE|COUGH|SMACK|UM|UH\]' | \ + grep -v -w -i -E '!SIL|' > ${x}.filt || exit 1; done fi diff --git a/egs/tedlium/s5/path.sh b/egs/tedlium/s5/path.sh index dcefaea23d8..16d5314b9c2 100755 --- a/egs/tedlium/s5/path.sh +++ b/egs/tedlium/s5/path.sh @@ -1,3 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. 
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/tedlium/s5/run.sh b/egs/tedlium/s5/run.sh index 7a36e49e8e0..e1dbf7b80e0 100755 --- a/egs/tedlium/s5/run.sh +++ b/egs/tedlium/s5/run.sh @@ -9,7 +9,7 @@ # The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, # which allow free non-commercial use, while only a citation is required. # -# Copyright 2014 Nickolay V. Shmyrev +# Copyright 2014 Nickolay V. Shmyrev # 2014 Brno University of Technology (Author: Karel Vesely) # Apache 2.0 # @@ -28,17 +28,18 @@ stage=0 # Data preparation if [ $stage -le 0 ]; then local/download_data.sh || exit 1 - + local/prepare_data.sh || exit 1 local/prepare_dict.sh || exit 1 utils/prepare_lang.sh data/local/dict_nosp \ - "<UNK>" data/local/lang_nosp data/lang_nosp || exit 1 + "<unk>" data/local/lang_nosp data/lang_nosp || exit 1 local/prepare_lm.sh || exit 1 fi + # Feature extraction feat_dir=$pwd/data/mfcc_features if [ $stage -le 1 ]; then @@ -100,7 +101,7 @@ if [ $stage -le 5 ]; then data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ exp/tri2/sil_counts_nowb.txt \ exp/tri2/pron_bigram_counts_nowb.txt data/local/dict - + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang cp -rT data/lang data/lang_test cp -rT data/lang data/lang_rescore @@ -134,6 +135,8 @@ if [ $stage -le 6 ]; then exp/tri3/graph data/test exp/tri3/decode_test || exit 1 fi +# steps/cleanup/debug_lexicon.sh --nj 100 --alidir exp/tri3 --cmd "$train_cmd" data/train data/lang exp/tri3 data/local/dict/lexicon.txt exp/tri3_debug_lexicon & + if [ $stage -le 7 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3 exp/tri3_ali || exit 1 diff --git a/egs/thchs30/README.txt b/egs/thchs30/README.txt new file mode 100644 index 00000000000..acbdea4a263 --- /dev/null +++ b/egs/thchs30/README.txt @@ -0,0 +1,10 @@ +THCHS30 is an open Chinese speech database published by Center for Speech and Language Technology (CSLT) at Tsinghua University. + +The original recording was conducted in 2002 by Dong Wang, supervised by Prof. Xiaoyan Zhu, at the Key State Lab of Intelligence and System, Department of Computer Science, Tsinghua University, and the original name was 'TCMSD', standing for 'Tsinghua Continuous Mandarin Speech Database'. The publication after 13 years has been initiated by Dr. Dong Wang and was supported by Prof. Xiaoyan Zhu. We hope to provide a toy database for new researchers in the field of speech recognition. Therefore, the database is totally free to academic users.
+ +The database can be downloaded from openslr: +http://www.openslr.org/18/ + +or from the CSLT server: +http://data.cslt.org/thchs30/README.html + diff --git a/egs/thchs30/s5/RESULTS b/egs/thchs30/s5/RESULTS new file mode 100644 index 00000000000..70718ea4c2a --- /dev/null +++ b/egs/thchs30/s5/RESULTS @@ -0,0 +1,61 @@ +#!/bin/bash +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_phone* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#phone task +%WER 31.49 [ 113986 / 362027, 20820 ins, 22043 del, 71123 sub ] exp/mono/decode_test_phone/wer_5 +%WER 20.56 [ 74445 / 362027, 15452 ins, 12457 del, 46536 sub ] exp/tri1/decode_test_phone/wer_5 +%WER 17.32 [ 62689 / 362027, 11937 ins, 11260 del, 39492 sub ] exp/tri2b/decode_test_phone/wer_6 +%WER 18.06 [ 65368 / 362027, 10426 ins, 13780 del, 41162 sub ] exp/tri3b/decode_test_phone/wer_5 +%WER 18.50 [ 66984 / 362027, 13117 ins, 11917 del, 41950 sub ] exp/tri3b/decode_test_phone.si/wer_5 +%WER 16.17 [ 58544 / 362027, 9628 ins, 11746 del, 37170 sub ] exp/tri4b/decode_test_phone/wer_6 +%WER 16.59 [ 60060 / 362027, 11440 ins, 10477 del, 38143 sub ] exp/tri4b/decode_test_phone.si/wer_6 +%WER 10.27 [ 37173 / 362027, 8675 ins, 6483 del, 22015 sub ] exp/tri4b_dnn/decode_test_phone/wer_4 +%WER 10.11 [ 36591 / 362027, 8702 ins, 6255 del, 21634 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it1/wer_4 +%WER 10.03 [ 36321 / 362027, 7490 ins, 6731 del, 22100 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it2/wer_5 +%WER 10.01 [ 36249 / 362027, 7507 ins, 6677 del, 22065 sub ] exp/tri4b_dnn_mpe/decode_test_phone_it3/wer_5 + +exit 0 + +for x in exp/{mono,tri1,tri2b,tri3b,tri4b,tri4b_dnn,tri4b_dnn_mpe}/decode_test_word* ; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean mono,tri1,tri2b,tri3b,GMM,DNN model +#clean test data +#word task +%WER 51.04 [ 41414 / 81139, 474 ins, 2404 del, 38536 sub ] exp/mono/decode_test_word/wer_9 +%WER 36.38 [ 29522 / 81139, 516 ins, 1096 del, 27910 sub ] exp/tri1/decode_test_word/wer_10 +%WER 32.51 [ 26379 / 81139, 469 ins, 940 del, 24970 sub ] exp/tri2b/decode_test_word/wer_9 +%WER 31.65 [ 25684 / 81139, 340 ins, 1085 del, 24259 sub ] exp/tri3b/decode_test_word/wer_9 +%WER 34.07 [ 27643 / 81139, 443 ins, 1100 del, 26100 sub ] exp/tri3b/decode_test_word.si/wer_10 +%WER 29.64 [ 24052 / 81139, 341 ins, 929 del, 22782 sub ] exp/tri4b/decode_test_word/wer_11 +%WER 31.71 [ 25732 / 81139, 472 ins, 902 del, 24358 sub ] exp/tri4b/decode_test_word.si/wer_10 +%WER 23.57 [ 19123 / 81139, 419 ins, 585 del, 18119 sub ] exp/tri4b_dnn/decode_test_word/wer_7 +%WER 23.40 [ 18984 / 81139, 397 ins, 567 del, 18020 sub ] exp/tri4b_dnn_mpe/decode_test_word_it1/wer_7 +%WER 23.27 [ 18884 / 81139, 396 ins, 553 del, 17935 sub ] exp/tri4b_dnn_mpe/decode_test_word_it2/wer_7 +%WER 23.18 [ 18804 / 81139, 368 ins, 618 del, 17818 sub ] exp/tri4b_dnn_mpe/decode_test_word_it3/wer_8 + +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_phone_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#phone task +%WER 84.01 [ 304141 / 362027, 717 ins, 275948 del, 27476 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/white/wer_4 +%WER 14.11 [ 51074 / 362027, 10941 ins, 8175 del, 31958 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/car/wer_5 +%WER 71.63 [ 259329 / 362027, 6164 ins, 217508 del, 35657 sub ] exp/tri4b_dnn_mpe/decode_phone_0db/cafe/wer_4 +%WER 40.04 [ 144946 / 
362027, 17764 ins, 35162 del, 92020 sub ] exp/tri4b_dnn_dae/decode_phone_0db/white/wer_6 +%WER 11.81 [ 42773 / 362027, 9598 ins, 7552 del, 25623 sub ] exp/tri4b_dnn_dae/decode_phone_0db/car/wer_5 +%WER 32.39 [ 117256 / 362027, 17793 ins, 27750 del, 71713 sub ] exp/tri4b_dnn_dae/decode_phone_0db/cafe/wer_6 +exit 0 + +for x in exp/{tri4b_dnn_mpe,tri4b_dnn_dae}/decode_word_0db/{white,car,cafe}; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +#clean MPE model and mixture DAE model +#0db noise test data +#word task +%WER 98.56 [ 79973 / 81139, 15 ins, 64293 del, 15665 sub ] exp/tri4b_dnn_mpe/decode_word_0db/white/wer_4 +%WER 28.10 [ 22799 / 81139, 553 ins, 661 del, 21585 sub ] exp/tri4b_dnn_mpe/decode_word_0db/car/wer_8 +%WER 85.58 [ 69438 / 81139, 321 ins, 49066 del, 20051 sub ] exp/tri4b_dnn_mpe/decode_word_0db/cafe/wer_8 +%WER 65.23 [ 52923 / 81139, 827 ins, 4198 del, 47898 sub ] exp/tri4b_dnn_dae/decode_word_0db/white/wer_13 +%WER 25.12 [ 20379 / 81139, 444 ins, 676 del, 19259 sub ] exp/tri4b_dnn_dae/decode_word_0db/car/wer_9 +%WER 53.38 [ 43308 / 81139, 907 ins, 4164 del, 38237 sub ] exp/tri4b_dnn_dae/decode_word_0db/cafe/wer_12 + +exit 0 diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh new file mode 100644 index 00000000000..1d8e768790f --- /dev/null +++ b/egs/thchs30/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/thchs30/s5/conf/decode_dnn.config b/egs/thchs30/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ b/egs/thchs30/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/thchs30/s5/conf/fbank.conf b/egs/thchs30/s5/conf/fbank.conf new file mode 100644 index 00000000000..8e6e36c69cf --- /dev/null +++ b/egs/thchs30/s5/conf/fbank.conf @@ -0,0 +1,3 @@ +# No non-default options for now. +#--sample-frequency=8000 +--num-mel-bins=40 diff --git a/egs/thchs30/s5/conf/mfcc.conf b/egs/thchs30/s5/conf/mfcc.conf new file mode 100644 index 00000000000..47d6c48bfe5 --- /dev/null +++ b/egs/thchs30/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +#--sample-frequency=8000 diff --git a/egs/thchs30/s5/local/dae/add-noise-mod.py b/egs/thchs30/s5/local/dae/add-noise-mod.py new file mode 100755 index 00000000000..33e8a297aef --- /dev/null +++ b/egs/thchs30/s5/local/dae/add-noise-mod.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +# Copyright 2016 Tsinghua University (Author: Chao Liu, Dong Wang). Apache 2.0. 
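As the comment block at the top of the new thchs30 cmd.sh (a little earlier in this patch) says, sites without GridEngine or slurm can swap queue.pl for run.pl. A minimal local-machine variant of that file might look like the sketch below; run.pl executes jobs on the local host, so keep the --nj values modest to avoid exhausting memory:

# cmd.sh for a single machine with no queueing system (sketch, not part of the patch)
export train_cmd=run.pl
export decode_cmd=run.pl
export mkgraph_cmd=run.pl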
+ + +from __future__ import print_function +import optparse +import random +import bisect +import re +import logging +import wave +import math +import struct +import sys +import os + +try: + import pyximport; pyximport.install() + from thchs30_util import * +except: + print("Cython possibly not installed, using standard python code. The process might be slow", file=sys.stderr) + + def energy(mat): + return float(sum([x * x for x in mat])) / len(mat) + + def mix(mat, noise, pos, scale): + ret = [] + l = len(noise) + for i in xrange(len(mat)): + x = mat[i] + d = int(x + scale * noise[pos]) + #if d > 32767 or d < -32768: + # logging.debug('overflow occurred!') + d = max(min(d, 32767), -32768) + ret.append(d) + pos += 1 + if pos == l: + pos = 0 + return (pos, ret) + + +def dirichlet(params): + samples = [random.gammavariate(x, 1) if x > 0 else 0. for x in params] + samples = [x / sum(samples) for x in samples] + for x in xrange(1, len(samples)): + samples[x] += samples[x - 1] + return bisect.bisect_left(samples, random.random()) + +def wave_mat(wav_filename): + f = wave.open(wav_filename, 'r') + n = f.getnframes() + ret = f.readframes(n) + f.close() + return list(struct.unpack('%dh' % n, ret)) + +def num_samples(mat): + return len(mat) + +def scp(scp_filename): + with open(scp_filename) as f: + for l in f: + yield tuple(l.strip().split()) + +def wave_header(sample_array, sample_rate): + byte_count = (len(sample_array)) * 2 # short + # write the header + hdr = struct.pack(' len(n): + noise_energies[type] = energy(n[p::]+n[0:len(n)-p:]) + else: + noise_energies[type] = energy(n[p:p+len(mat):]) + scale = math.sqrt(noise / noise_energies[type]) + logging.debug('noise scale: %f', scale) + pos, result = mix(mat, n, p, scale) + noises[type] = (pos, n) + if args.wavdir != 'NULL': + output_wave_file(args.wavdir, tag, result) + else: + output(tag, result) + +if __name__ == '__main__': + main() + + + diff --git a/egs/thchs30/s5/local/dae/run_dae.sh b/egs/thchs30/s5/local/dae/run_dae.sh new file mode 100755 index 00000000000..f6a6db3a01a --- /dev/null +++ b/egs/thchs30/s5/local/dae/run_dae.sh @@ -0,0 +1,149 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#Conducts experiments of dae-based denoisng + +stage=0 +nj=8 + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) +. utils/parse_options.sh || exit 1; + +thchs=$1 + +#generate noisy data. We focuse on the 0db condition. +#For training set, generate noisy data with SNR mean=0, variance=10, with three noise types mixed together. +#For dev, generate noisy data with SNR mean=0, variance=0, with three niose types mixed together +#For test, use the standard test data which were generated by SNR mean=0, variance=0. + +if [ $stage = 0 ]; then + #generat noise.scp + mkdir -p data/dae/noise && \ + awk '{print $1 " '$thchs'/resource/noise/"$2}' $thchs/resource/noise/noise.scp > data/dae/noise/noise.scp || exit 1 + + echo "DAE: generate training data..." + noise_scp=data/dae/noise/noise.scp + noise_prior="0.0,10.0,10.0,10.0" #define noise type to sample. 
[S_clean, S_white, S_car, S_cafe] + noise_level=0 #0db condition + sigma0=10 #some random in SNR + seed=32 + verbose=0 + wavdir=wav/dae/train + rm -rf data/dae/train && mkdir -p data/dae/train || exit 1 + cp data/fbank/train/{spk2utt,utt2spk,text} data/dae/train || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/train/wav.scp > data/dae/train/wav.scp || exit 1 + + mkdir -p exp/dae/gendata + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/train_split_${n}.scp" + done + utils/split_scp.pl data/fbank/train/wav.scp $split_scps || exit 1 + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_train.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/train_split_JOB.scp --wavdir $wavdir \ + || exit 1 + + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/train exp/dae/gendata fbank/dae/train || exit 1 + steps/compute_cmvn_stats.sh data/dae/train exp/dae/cmvn \ + fbank/dae/train || exit 1 + + #genreate dev data. Just the 0db condition is produced. Multiple noise types mixed together. + echo "DAE: generating dev data..." + wavdir=wav/dae/dev/0db + sigma0=0 #no random in SNR + rm -rf data/dae/dev/0db && mkdir -p data/dae/dev/0db && \ + cp -L data/fbank/dev/{spk2utt,utt2spk,text} data/dae/dev/0db || exit 1 + mkdir -p $wavdir && awk '{print $1 " '$wavdir'/"$1".wav"}' data/fbank/dev/wav.scp > data/dae/dev/0db/wav.scp || exit 1 + + split_scps="" + for n in $(seq $nj); do + split_scps="$split_scps exp/dae/gendata/dev_split_${n}.scp" + done + utils/split_scp.pl data/fbank/dev/wav.scp $split_scps || exit 1 + + $train_cmd JOB=1:$nj exp/dae/gendata/add_noise_dev.JOB.log \ + local/dae/add-noise-mod.py --noise-level $noise_level \ + --sigma0 $sigma0 --seed $seed --verbose $verbose \ + --noise-prior $noise_prior --noise-src $noise_scp \ + --wav-src exp/dae/gendata/dev_split_JOB.scp --wavdir $wavdir \ + || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/dev/0db exp/dae/gendata fbank/dae/dev/0db || exit 1 + steps/compute_cmvn_stats.sh data/dae/dev/0db exp/dae/cmvn \ + fbank/dae/dev/0db || exit 1 + + #generate test data. Assume it has been downloaded in $thchs/test-noise + echo "DAE: generating test data..." 
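The generator invoked above exposes the mixing parameters on its command line (--noise-level, --sigma0, --noise-prior, --noise-src, --wav-src, --wavdir). Assuming --noise-level is the target SNR in dB, as the "0db condition" comments suggest, a hypothetical 5 dB dev condition could be produced with the same pattern (the output directory name is illustrative):

wavdir=wav/dae/dev/5db
mkdir -p $wavdir
local/dae/add-noise-mod.py --noise-level 5 --sigma0 0 --seed 32 --verbose 0 \
  --noise-prior "0.0,10.0,10.0,10.0" --noise-src data/dae/noise/noise.scp \
  --wav-src exp/dae/gendata/dev_split_1.scp --wavdir $wavdir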
+ #generate fbank + for x in car white cafe; do + echo "producing fbanks for $x" + mkdir -p data/dae/test/0db/$x && \ + cp -L data/fbank/test/{spk2utt,utt2spk,text} data/dae/test/0db/$x && \ + awk '{print $1 " '$thchs'/test-noise/0db/'$x'/"$1".wav"}' data/fbank/test/wav.scp > data/dae/test/0db/$x/wav.scp || exit 1 + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" \ + data/dae/test/0db/$x exp/dae/gendata fbank/dae/test/0db/$x || exit 1 + echo "generating cmvn for test data $x" + steps/compute_cmvn_stats.sh data/dae/test/0db/$x exp/dae/cmvn \ + fbank/dae/test/0db/$x || exit 1 + cp -R data/dae/test/0db/$x data/dae/test/0db/${x}_phone && cp data/test/phone.txt data/dae/test/0db/${x}_phone/text || exit 1 + done +fi + +#DAE training +if [ $stage -le 1 ]; then + #train dnn dae using data with mixed noise + #produce merged feats.scp as --labels for both training and cv + dir=exp/tri4b_dnn_dae && mkdir -p exp/tri4b_dnn_dae || exit 1 + cat data/fbank/train/feats.scp data/fbank/dev/feats.scp | sort -u > $dir/tgt_feats.scp + cat data/fbank/train/cmvn.scp data/fbank/dev/cmvn.scp | sort -u > $dir/tgt_cmvn.scp + + num_fea=$(feat-to-dim scp:$dir/tgt_feats.scp -) + echo "num_fea = $num_fea" + + $cuda_cmd exp/tri4b_dnn_dae/log/train_nnet.log \ + steps/nnet/train.sh --hid-layers 2 --hid-dim 1200 \ + --cmvn-opts "--norm-vars=false" --splice 10 \ + --learn-rate 0.0001 \ + --train_tool_opts "--objective-function=mse" \ + --copy_feats false \ + --labels "ark:copy-feats scp:$dir/tgt_feats.scp ark:- | apply-cmvn --norm-vars=false scp:$dir/tgt_cmvn.scp ark:- ark:- | feat-to-post ark:- ark:-|" \ + --num-tgt $num_fea \ + --proto-opts '--no-softmax ' \ + data/dae/train data/dae/dev/0db data/lang \ + data/fbank/train data/fbank/dev \ + exp/tri4b_dnn_dae || exit 1; + nnet-concat exp/tri4b_dnn_dae/final.feature_transform exp/tri4b_dnn_dae/final.nnet \ + exp/tri4b_dnn_mpe/final.feature_transform exp/tri4b_dnn_dae/dae.nnet || exit 1 + +fi + +#decoding +if [ $stage -le 2 ]; then + for x in car white cafe; do + ( + #decode word + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_mpe/decode_word_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_word data/dae/test/0db/$x exp/tri4b_dnn_dae/decode_word_0db/$x || exit 1; + + #decode phone + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_mpe/decode_phone_0db/$x || exit 1; + steps/nnet/decode.sh --cmd "$decode_cmd" --nj $nj \ + --srcdir exp/tri4b_dnn_mpe --feature-transform exp/tri4b_dnn_dae/dae.nnet \ + exp/tri4b/graph_phone data/dae/test/0db/${x}_phone exp/tri4b_dnn_dae/decode_phone_0db/$x || exit 1; + ) & + done +fi + diff --git a/egs/thchs30/s5/local/dae/thchs30_util.pyx b/egs/thchs30/s5/local/dae/thchs30_util.pyx new file mode 100755 index 00000000000..281ff166032 --- /dev/null +++ b/egs/thchs30/s5/local/dae/thchs30_util.pyx @@ -0,0 +1,27 @@ +# Copyright 2016 Tsinghua University (Author: Chao Liu). Apache 2.0. 
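add-noise-mod.py tries to import pyximport so that the Cython helper below (thchs30_util.pyx) replaces the slow pure-Python energy/mix loops. A quick way to check that this optional Cython path compiles on your machine, assuming Cython and a C compiler are installed, is to run the following from egs/thchs30/s5/local/dae:

python -c 'import pyximport; pyximport.install(); import thchs30_util; print(thchs30_util.energy([1.0, 2.0, 3.0]))'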
+ +def energy(list mat): + cdef float e + cdef int i, j, l + l = len(mat) + for i in range(l): + j = mat[i] + e += j * j + e /= l + return e + +def mix(list mat, list noise, int pos, double scale): + cdef len_noise, len_mat, i, x, y + ret = [] + len_noise = len(noise) + len_mat = len(mat) + for i in range(len_mat): + x = mat[i] + y = int(x + scale * noise[pos]) + if y > 32767: + y = 32767 + elif y < -32768: + y = -32768 + ret.append(y) + pos = (pos + 1) % len_noise + return pos, ret diff --git a/egs/thchs30/s5/local/download_and_untar.sh b/egs/thchs30/s5/local/download_and_untar.sh new file mode 100755 index 00000000000..655e674dc9b --- /dev/null +++ b/egs/thchs30/s5/local/download_and_untar.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Tsinghua University (author: Dong Wang) +# Apache 2.0 + +# Adapted from librispeech recipe local/download_and_untar.sh + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /nfs/public/materials/data/thchs30-openslr www.openslr.org/resources/18 data_thchs30" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_thchs30, test-noise, resource" +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_thchs30 test-noise resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + + +sizes="6453425169 1971460210 24813708" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + pwd + echo " wget --no-check-certificate $full_url" + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi diff --git a/egs/thchs30/s5/local/nnet/run_dnn.sh b/egs/thchs30/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..d40f48e3609 --- /dev/null +++ b/egs/thchs30/s5/local/nnet/run_dnn.sh @@ -0,0 +1,90 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#run from ../.. +#DNN training, both xent and MPE + + +. 
./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +stage=0 +nj=8 + +. utils/parse_options.sh || exit 1; + +gmmdir=$1 +alidir=$2 +alidir_cv=$3 + +#generate fbanks +if [ $stage -le 0 ]; then + echo "DNN training: stage 0: feature generation" + rm -rf data/fbank && mkdir -p data/fbank && cp -R data/{train,dev,test,test_phone} data/fbank || exit 1; + for x in train dev test; do + echo "producing fbank for $x" + #fbank generation + steps/make_fbank.sh --nj $nj --cmd "$train_cmd" data/fbank/$x exp/make_fbank/$x fbank/$x || exit 1 + #ompute cmvn + steps/compute_cmvn_stats.sh data/fbank/$x exp/fbank_cmvn/$x fbank/$x || exit 1 + done + + echo "producing test_fbank_phone" + cp data/fbank/test/feats.scp data/fbank/test_phone && cp data/fbank/test/cmvn.scp data/fbank/test_phone || exit 1; + +fi + + +#xEnt training +if [ $stage -le 1 ]; then + outdir=exp/tri4b_dnn + #NN training + (tail --pid=$$ -F $outdir/log/train_nnet.log 2>/dev/null)& # forward log + $cuda_cmd $outdir/log/train_nnet.log \ + steps/nnet/train.sh --copy_feats false --cmvn-opts "--norm-means=true --norm-vars=false" --hid-layers 4 --hid-dim 1024 \ + --learn-rate 0.008 data/fbank/train data/fbank/dev data/lang $alidir $alidir_cv $outdir || exit 1; + #Decode (reuse HCLG graph in gmmdir) + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --srcdir $outdir --config conf/decode_dnn.config --acwt 0.1 \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone || exit 1; + )& + +fi + +#MPE training + +srcdir=exp/tri4b_dnn +acwt=0.1 + +if [ $stage -le 2 ]; then + # generate lattices and alignments + steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ + data/fbank/train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + data/fbank/train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 3 ]; then + outdir=exp/tri4b_dnn_mpe + #Re-train the DNN by 3 iteration of MPE + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 3 --acwt $acwt --do-smbr false \ + data/fbank/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $outdir || exit 1 + #Decode (reuse HCLG graph) + for ITER in 3 2 1; do + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_word data/fbank/test $outdir/decode_test_word_it${ITER} || exit 1; + )& + ( + steps/nnet/decode.sh --nj $nj --cmd "$decode_cmd" --nnet $outdir/${ITER}.nnet --config conf/decode_dnn.config --acwt $acwt \ + $gmmdir/graph_phone data/fbank/test_phone $outdir/decode_test_phone_it${ITER} || exit 1; + )& + done +fi + diff --git a/egs/thchs30/s5/local/score.sh b/egs/thchs30/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/thchs30/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/thchs30/s5/local/thchs-30_data_prep.sh b/egs/thchs30/s5/local/thchs-30_data_prep.sh new file mode 100755 index 00000000000..7a85274ce83 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_data_prep.sh @@ -0,0 +1,39 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). 
Apache 2.0. + +#This script pepares the data directory for thchs30 recipe. +#It reads the corpus and get wav.scp and transcriptions. + +dir=$1 +corpus_dir=$2 + + +cd $dir + +echo "creating data/{train,dev,test}" +mkdir -p data/{train,dev,test} + +#create wav.scp, utt2spk.scp, spk2utt.scp, text +( +for x in train dev test; do + echo "cleaning data/$x" + cd $dir/data/$x + rm -rf wav.scp utt2spk spk2utt word.txt phone.txt text + echo "preparing scps and text in data/$x" + for nn in `find $corpus_dir/$x/*.wav | sort -u | xargs -i basename {} .wav`; do + echo $nn $corpus_dir/$x/$nn.wav >> wav.scp + echo $nn $nn >> utt2spk + echo $nn $nn >> spk2utt + echo $nn `sed -n 1p $corpus_dir/data/$nn.wav.trn` >> word.txt + echo $nn `sed -n 3p $corpus_dir/data/$nn.wav.trn` >> phone.txt + done + cp word.txt text +done +) || exit 1 + +echo "creating test_phone for phone decoding" +( + rm -rf data/test_phone && cp -R data/test data/test_phone || exit 1 + cd data/test_phone && rm text && cp phone.txt text || exit 1 +) + diff --git a/egs/thchs30/s5/local/thchs-30_decode.sh b/egs/thchs30/s5/local/thchs-30_decode.sh new file mode 100755 index 00000000000..f9661f61f21 --- /dev/null +++ b/egs/thchs30/s5/local/thchs-30_decode.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#Copyright 2016 Tsinghua University (Author: Dong Wang, Xuewei Zhang). Apache 2.0. + +#decoding wrapper for thchs30 recipe +#run from ../ + +nj=8 +mono=false + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +. utils/parse_options.sh || exit 1; +decoder=$1 +srcdir=$2 +datadir=$3 + + +if [ $mono = true ];then + echo "using monophone to generate graph" + opt="--mono" +fi + +#decode word +utils/mkgraph.sh $opt data/graph/lang $srcdir $srcdir/graph_word || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_word $datadir/test $srcdir/decode_test_word || exit 1 + +#decode phone +utils/mkgraph.sh $opt data/graph_phone/lang $srcdir $srcdir/graph_phone || exit 1; +$decoder --cmd "$decode_cmd" --nj $nj $srcdir/graph_phone $datadir/test_phone $srcdir/decode_test_phone || exit 1 + + diff --git a/egs/thchs30/s5/local/wer_output_filter b/egs/thchs30/s5/local/wer_output_filter new file mode 100755 index 00000000000..1ccb651a258 --- /dev/null +++ b/egs/thchs30/s5/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python +#Copyright 2016 Tsinghua University (Author: Dong Wang). Apache 2.0. + +#This script accepts a Chinese stream and inserts blanks between Chinese characters +#Used to prepare character-based transcriptions and compute CER. + +from __future__ import print_function +import sys + +for l in sys.stdin: + l=l.strip() + ll=l.split() + lk=ll[0] + for v in ll[1:]: + v = v.decode('utf-8') + for i in v: + lk= lk + ' ' + i + + print (lk.encode('utf-8')) diff --git a/egs/thchs30/s5/path.sh b/egs/thchs30/s5/path.sh new file mode 100755 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/thchs30/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/thchs30/s5/run.sh b/egs/thchs30/s5/run.sh new file mode 100755 index 00000000000..24645f59e83 --- /dev/null +++ b/egs/thchs30/s5/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh + +H=`pwd` #exp home +n=8 #parallel jobs + +#corpus and trans directory +thchs=/nfs/public/materials/data/thchs30-openslr + +#you can obtain the database by uncommting the following lines +#[ -d $thchs ] || mkdir -p $thchs || exit 1 +#echo "downloading THCHS30 at $thchs ..." +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 data_thchs30 || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 resource || exit 1 +#local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 test-noise || exit 1 + +#data preparation +#generate text, wav.scp, utt2pk, spk2utt +local/thchs-30_data_prep.sh $H $thchs/data_thchs30 || exit 1; + +#produce MFCC features +rm -rf data/mfcc && mkdir -p data/mfcc && cp -R data/{train,dev,test,test_phone} data/mfcc || exit 1; +for x in train dev test; do + #make mfcc + steps/make_mfcc.sh --nj $n --cmd "$train_cmd" data/mfcc/$x exp/make_mfcc/$x mfcc/$x || exit 1; + #compute cmvn + steps/compute_cmvn_stats.sh data/mfcc/$x exp/mfcc_cmvn/$x mfcc/$x || exit 1; +done +#copy feats and cmvn to test.ph, avoid duplicated mfcc & cmvn +cp data/mfcc/test/feats.scp data/mfcc/test_phone && cp data/mfcc/test/cmvn.scp data/mfcc/test_phone || exit 1; + + +#prepare language stuff +#build a large lexicon that invovles words in both the training and decoding. +( + echo "make word graph ..." + cd $H; mkdir -p data/{dict,lang,graph} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict && \ + cat $thchs/resource/dict/lexicon.txt $thchs/data_thchs30/lm_word/lexicon.txt | \ + grep -v '' | grep -v '' | sort -u > data/dict/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict "" data/local/lang data/lang || exit 1; + gzip -c $thchs/data_thchs30/lm_word/word.3gram.lm > data/graph/word.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang data/graph/word.3gram.lm.gz $thchs/data_thchs30/lm_word/lexicon.txt data/graph/lang || exit 1; +) + +#make_phone_graph +( + echo "make phone graph ..." 
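Going back to the commented-out download lines near the top of this run.sh: they use the download_and_untar.sh helper added earlier in this patch, and all three corpus parts can be fetched in one loop. A sketch, assuming $thchs points at an existing directory where you want the data (the helper refuses to run if the directory does not exist):

thchs=/nfs/public/materials/data/thchs30-openslr   # or wherever you keep corpora
mkdir -p $thchs
for part in data_thchs30 resource test-noise; do
  local/download_and_untar.sh $thchs http://www.openslr.org/resources/18 $part || exit 1
done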
+ cd $H; mkdir -p data/{dict_phone,graph_phone,lang_phone} && \ + cp $thchs/resource/dict/{extra_questions.txt,nonsilence_phones.txt,optional_silence.txt,silence_phones.txt} data/dict_phone && \ + cat $thchs/data_thchs30/lm_phone/lexicon.txt | grep -v '' | sort -u > data/dict_phone/lexicon.txt && \ + echo " sil " >> data/dict_phone/lexicon.txt || exit 1; + utils/prepare_lang.sh --position_dependent_phones false data/dict_phone "" data/local/lang_phone data/lang_phone || exit 1; + gzip -c $thchs/data_thchs30/lm_phone/phone.3gram.lm > data/graph_phone/phone.3gram.lm.gz || exit 1; + utils/format_lm.sh data/lang_phone data/graph_phone/phone.3gram.lm.gz $thchs/data_thchs30/lm_phone/lexicon.txt \ + data/graph_phone/lang || exit 1; +) + +#monophone +steps/train_mono.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono || exit 1; +#test monophone model +local/thchs-30_decode.sh --mono true --nj $n "steps/decode.sh" exp/mono data/mfcc & + +#monophone_ali +steps/align_si.sh --boost-silence 1.25 --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/mono exp/mono_ali || exit 1; + +#triphone +steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 data/mfcc/train data/lang exp/mono_ali exp/tri1 || exit 1; +#test tri1 model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri1 data/mfcc & + +#triphone_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +#lda_mllt +steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 2500 15000 data/mfcc/train data/lang exp/tri1_ali exp/tri2b || exit 1; +#test tri2b model +local/thchs-30_decode.sh --nj $n "steps/decode.sh" exp/tri2b data/mfcc & + + +#lda_mllt_ali +steps/align_si.sh --nj $n --cmd "$train_cmd" --use-graphs true data/mfcc/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + +#sat +steps/train_sat.sh --cmd "$train_cmd" 2500 15000 data/mfcc/train data/lang exp/tri2b_ali exp/tri3b || exit 1; +#test tri3b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri3b data/mfcc & + +#sat_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri3b exp/tri3b_ali || exit 1; + +#quick +steps/train_quick.sh --cmd "$train_cmd" 4200 40000 data/mfcc/train data/lang exp/tri3b_ali exp/tri4b || exit 1; +#test tri4b model +local/thchs-30_decode.sh --nj $n "steps/decode_fmllr.sh" exp/tri4b data/mfcc & + +#quick_ali +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/train data/lang exp/tri4b exp/tri4b_ali || exit 1; + +#quick_ali_cv +steps/align_fmllr.sh --nj $n --cmd "$train_cmd" data/mfcc/dev data/lang exp/tri4b exp/tri4b_ali_cv || exit 1; + +#train dnn model +local/nnet/run_dnn.sh --stage 0 --nj $n exp/tri4b exp/tri4b_ali exp/tri4b_ali_cv || exit 1; + +#train dae model +#python2.6 or above is required for noisy data generation. +#To speed up the process, pyximport for python is recommeded. 
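run_dae.sh (shown earlier in this patch) reads --stage and --nj through utils/parse_options.sh, so once the noisy training and dev data exist you can resume directly from DAE training and decoding instead of regenerating everything, for example:

local/dae/run_dae.sh --stage 1 --nj $n $thchs || exit 1   # stage 0 (noisy-data generation) is skipped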
+local/dae/run_dae.sh --stage 0 $thchs || exit 1; diff --git a/egs/thchs30/s5/steps b/egs/thchs30/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/thchs30/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/thchs30/s5/utils b/egs/thchs30/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/thchs30/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) diff --git a/egs/tidigits/s5/path.sh b/egs/tidigits/s5/path.sh index 3ee46078956..2d17b17a84a 100755 --- a/egs/tidigits/s5/path.sh +++ b/egs/tidigits/s5/path.sh @@ -1,3 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/timit/README.txt b/egs/timit/README.txt index 7e5bfa8e82d..f8ca39c4fc9 100644 --- a/egs/timit/README.txt +++ b/egs/timit/README.txt @@ -21,15 +21,14 @@ About TIMIT: Each subdirectory of this directory contains the scripts for a sequence of experiments. - s3: Monophone GMM/HMM system trained with Maximum likelihood. Training - is done with 61 phonemes, that are collapsed down to 39 phoneme - during testing. Implemented by Navdeep Jaitly (ndjaitly@cs.toronto.edu) - [from Dan: I believe this is now somewhat out of date, please us s5/] - - s4: Monophone, Triphone GMM/HMM systems trained with Maximum Likelihood. - Training is done on 48 phonemes (see- Lee and Hon: Speaker-Independent + s5: Monophone, Triphone GMM/HMM systems trained with Maximum Likelihood, + followed by SGMM and DNN recipe. + Training is done on 48 phonemes (see- Lee and Hon: Speaker-Independent Phone Recognition Using Hidden Markov Models. IEEE TRANSACTIONS ON ACOUSTICS. SPEECH, AND SIGNAL PROCESSING, VOL. 31. NO. 11, PG. 1641-48, - NOVEMBER 1989, ). Implemented by Arnab Ghoshal (arnab13@gmail.com) + NOVEMBER 1989, ). In scoring we map to 39 phonememes, as is usually + done in conference papers. + The earlier versions of TIMIT scripts were implemented by Navdeep Jaitly, + Arnab Ghoshal. Current version was developed by Bagher BabaAli and is + maintained by Karel Vesely (vesis84@gmail.com). - s5: the currently recommended recipe. diff --git a/egs/timit/s3/RESULTS b/egs/timit/s3/RESULTS deleted file mode 100644 index aeb53d8a5c2..00000000000 --- a/egs/timit/s3/RESULTS +++ /dev/null @@ -1,11 +0,0 @@ -# dev set -#compute-wer --mode=present ark:- ark,p:tmp -#%WER 34.42 [ 5003 / 14534, 218 ins, 1974 del, 2811 sub ] -#%SER 100.00 [ 400 / 400 ] -#Scored 400 sentences, 0 not present in hyp. -# test set -#compute-wer --mode=present ark:- ark,p:tmp -#%WER 35.67 [ 2479 / 6949, 98 ins, 1009 del, 1372 sub ] -#%SER 100.00 [ 192 / 192 ] - - diff --git a/egs/timit/s3/conf/plp.conf b/egs/timit/s3/conf/plp.conf deleted file mode 100644 index c4b73674cab..00000000000 --- a/egs/timit/s3/conf/plp.conf +++ /dev/null @@ -1,2 +0,0 @@ -# No non-default options for now. - diff --git a/egs/timit/s3/conf/topo.proto b/egs/timit/s3/conf/topo.proto deleted file mode 100644 index 14a6da73983..00000000000 --- a/egs/timit/s3/conf/topo.proto +++ /dev/null @@ -1,22 +0,0 @@ - - - -NONSILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - - -SILENCEPHONES - - 0 0 0 0.25 1 0.25 2 0.25 3 0.25 - 1 1 1 0.25 2 0.25 3 0.25 4 0.25 - 2 2 1 0.25 2 0.25 3 0.25 4 0.25 - 3 3 1 0.25 2 0.25 3 0.25 4 0.25 - 4 4 4 0.25 5 0.75 - 5 - - diff --git a/egs/timit/s3/local/create_biphone_lm.sh b/egs/timit/s3/local/create_biphone_lm.sh deleted file mode 100755 index 2c4c84dba2e..00000000000 --- a/egs/timit/s3/local/create_biphone_lm.sh +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright 2012 Navdeep Jaitly - -# Is mostly a cut and paste operation, derived from -# ../../../tools/kaldi_lm/train_lm.sh to create an lm for -# biphone/bigram language models, which train_lm.sh does not -# deign to do. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz -# Expects train.gz, word_map in [argument 1 folder]. -# Call from local/timit_train_lms.sh. - -if [ $# != 1 ]; then - echo "Usage: ../../local/create_biphone_lm.sh [lm folder]" - echo "eg: ../../local/create_biphone_lm.sh data/local" - exit 1; -fi - - -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -dir=$1 - -requirements="$dir/train.gz $dir/word_map" -for f in $requirements; do - if [ ! -f $f ]; then - echo "create_biphone_lm.sh: no such file $f" - exit 1; - fi -done - -echo "Training biphone language model in folder $dir" -subdir=$dir/biphone -echo "Creating directory $subdir" -mkdir -p $subdir - -# Clearly we don't have enough data to build a properly cross validated back-off model. -# In addition there is no need for a backoff model since we have all bigrams in the -# training data. However, taking out some of the data for validation set may remove -# some of the bigrams. This may seem like a bad thing, but could be a good thing if -# the resulting smoothing helps. - -heldout_sent=300 -write_arpa=1 - -if [ -s $subdir/ngrams.gz -a -s $subdir/heldout_ngrams.gz ]; then - echo "Not creating raw N-gram counts ngrams.gz and heldout_ngrams.gz since they already exist in $subdir" - echo "(remove them if you want them regenerated)" -else - echo Getting raw N-gram counts - - gunzip -c $dir/train.gz | tail -n +$heldout_sent | get_raw_ngrams 2 | sort | uniq -c |\ - uniq_to_ngrams | sort | gzip -c > $subdir/ngrams.gz - # Note: the Perl command below adds ":" before the count, which - # is a marker that these N-grams are test N-grams. - gunzip -c $dir/train.gz | head -n $heldout_sent | \ - get_raw_ngrams 2 | sort | uniq -c | uniq_to_ngrams | \ - perl -ane 's/(\S+)$/:$1/; print;' | sort | gzip -c > $subdir/heldout_ngrams.gz -fi - -cat > $subdir/config.0 < $subdir/config.diff_1 < $subdir/config.diff_2 < $subdir/config.diff_3 < $subdir/config.diff_4 < $subdir/config.diff_5 < $subdir/config.diff_6 < $subdir/config.diff_7 < $dir/wordlist.mapped - -# Define a subroutine -get_perplexity() { # echoes the perplexity to stdout. uses current "$config" as config - time gunzip -c $subdir/ngrams.gz | \ - discount_ngrams "$config" | sort | merge_ngrams | \ - interpolate_ngrams $dir/wordlist.mapped 0.5 | sort | \ - sort -m <(gunzip -c $subdir/heldout_ngrams.gz) - | compute_perplexity -} - -mkdir -p $subdir/configs/ $subdir/perplexities/ - -if [ -f $subdir/config.$num_configs ]; then - echo Not doing optimization of discounting parameters since - echo file $subdir/config.$num_configs already exists -else - for n in `seq 1 $num_configs`; do - echo "Iteration $n/$num_configs of optimizing discounting parameters" - for alpha in -0.25 0.0 0.35; do - config=$subdir/configs/config.$n.$alpha - # Note: if this ensure-nonnegative stuff gets active here it would cause - # the optimization to give the wrong answer, but we've set up the config files - # in such a way that this shouldn't happen. 
- scale_configs.pl $subdir/config.$[$n-1] $subdir/config.diff_$n $alpha > $config - get_perplexity > $subdir/perplexities/$n.$alpha & - done - wait - optimize_alpha.pl -0.25 `cat $subdir/perplexities/$n.-0.25` \ - 0.0 `cat $subdir/perplexities/$n.0.0` \ - 0.35 `cat $subdir/perplexities/$n.0.35` > $subdir/perplexities/alpha.$n || exit 1; - alpha=`cat $subdir/perplexities/alpha.$n` - echo "Alpha value on iter $n is $alpha" - scale_configs.pl $subdir/config.$[$n-1] $subdir/config.diff_$n $alpha > $subdir/config.$n - done -fi -echo Final config is: -cat $subdir/config.$num_configs - -# Create final LM as discounted (but not interpolated) N-grams: -if gunzip -c $subdir/ngrams_disc.gz >&/dev/null; then - echo "Not creating discounted N-grams file $subdir/ngrams_disc.gz since it already exists" -else - echo "Discounting N-grams." - gunzip -c $subdir/ngrams.gz | \ - discount_ngrams $subdir/config.$num_configs | sort | merge_ngrams | \ - gzip -c > $subdir/ngrams_disc.gz -fi - -echo "Computing final perplexity" -gunzip -c $subdir/ngrams_disc.gz | \ - interpolate_ngrams $dir/wordlist.mapped 0.5 | \ - sort | sort -m <(gunzip -c $subdir/heldout_ngrams.gz) - | \ - compute_perplexity 2>&1 | tee $subdir/perplexity & - - -if [ $write_arpa == 1 ]; then - echo "Building ARPA LM (perplexity computation is in background)" - mkdir -p $subdir/tmpdir - gunzip -c $subdir/ngrams_disc.gz | \ - interpolate_ngrams --arpa $dir/wordlist.mapped 0.5 | \ - sort | finalize_arpa.pl $subdir/tmpdir | \ - map_words_in_arpa.pl $dir/word_map | \ - gzip -c > $subdir/lm_unpruned.gz -fi - diff --git a/egs/timit/s3/local/export_log_fbanks_to_htk.sh b/egs/timit/s3/local/export_log_fbanks_to_htk.sh deleted file mode 100755 index 208f7d2a037..00000000000 --- a/egs/timit/s3/local/export_log_fbanks_to_htk.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -# Copyright 2012 Navdeep Jaitly - -# This program allows you to export log filterbank data from -# KALDI to HTK format. Also exported is the force alignment -# data, from the gmm alignment. -# HTK files are created, one per input file. -# alignment file: ali is create one for the entire set (test/dev/train). -# Can be used for offline neural network training if you don't use -# the abilities of Kaldi to do so. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - - -config=conf/mfcc.conf -data=data -#out_path=/ais/gobi2/ndjaitly/Data/Kaldi/Spectrograms/ -#out_path=/ais/gobi2/ndjaitly/Data/Kaldi/FBANKS/ -out_path=/ais/gobi2/ndjaitly/Data/Kaldi/export/FBANKS_25_10/ -num_mel_bins=40 -power_spectrum_only=0 -frame_length=25 -frame_shift=10 - -#for test in train test dev ; do -for test in test dev ; do - scp=$data/$test/wav.scp - out_dir=$out_path/$test/ - out_scp=$out_path/$test/htk.scp - out_ali=$out_path/$test/ali - mkdir -p $out_dir - cat $scp | awk -v outdir=$out_dir '{ printf $1 " " outdir $1 ".htk\n"; }' > $out_scp - compute-fbank-feats --frame-length=$frame_length --frame-shift=$frame_shift \ - --num-mel-bins=$num_mel_bins --output-format=htk --verbose=2 \ - --config=$config scp:$scp scp:$out_scp - ali-to-pdf exp/mono/final.mdl ark:exp/mono_ali_$test/ali t,ark:- > $out_ali -done diff --git a/egs/timit/s3/local/get_word_map.pl b/egs/timit/s3/local/get_word_map.pl deleted file mode 100755 index fe90ba68a06..00000000000 --- a/egs/timit/s3/local/get_word_map.pl +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env perl -# A very small modification on ../../../tools/kaldi_lm/get_word_map.pl to account -# for no OOV vocab terms in timit. - Navdeep Jaitly. - - -# This program reads in a file with one word -# on each line, and outputs a "translation file" of the form: -# word short-form-of-word -# on each line, -# where short-form-of-word is a kind of abbreviation of the word. -# -# It uses the letters a-z and A-Z, plus the characters from -# 128 to 255. The first words in the file have the shortest representation. -# -# For convenience, it makes sure to give , -# a consistent labeling, as A and B respectively. - - -# set up character table and some variables. -@C = (); -foreach $x (ord('A')...ord('Z')) { push @C, chr($x); } -foreach $x (ord('a')...ord('z')) { push @C, chr($x); } -foreach $x(128...254) { push @C, chr($x); } # 255 is space so don't include it. - -@index = ( 2 ); # array of indexes into @C... count up to [dim of C -1] - # then add another index onto this. Set it to 3, since 0 and 1 are - # reserved for and respectively. - -if (@ARGV != 2 && @ARGV != 3) { - die "Usage: get_word_map.pl bos-symbol eos-symbol [words-in-order]\n"; -} - -$bos = shift @ARGV; -$eos = shift @ARGV; -print "$bos $C[0]\n"; -print "$eos $C[1]\n"; - -sub get_short_form(); - -while(<>) { - chop; - $word = $_; - $word =~ m:^\S+$: || die "Bad word $word"; - if($seen{$word}) { die "Word $word repeated"; } - $seen{$word}++; - if ($word ne $bos && $word ne $eos) { - $short_form = get_short_form(); - print "$word $short_form\n"; - } -} - -sub get_short_form() { - $ans = ""; - foreach $i (@index) { $ans = $C[$i] . $ans; } # - # Now increment the index. - $index[0]++; - $cur_idx = 0; - while ($index[$cur_idx] == @C) { # E.g. if the least significant digit - # is out of the valid range... carry one. - $index[$cur_idx] = 0; - $cur_idx++; - $index[$cur_idx]++; # This will extend the array if necessary. - } - return $ans; -} diff --git a/egs/timit/s3/local/make_trans.pl b/egs/timit/s3/local/make_trans.pl deleted file mode 100755 index 230b4fab2bf..00000000000 --- a/egs/timit/s3/local/make_trans.pl +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2012 Navdeep Jaitly. -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# usage: make_trans.sh prefix in.flist out.txt out.scp - -# prefix is first letters of the database "key" (rest are numeric) - -# in.flist is just a list of the WAV file paths (X.WAV). The -# monophone transcriptions are in the files (X.phn). -# out.txt is the output transcriptions in format "key word1 word\n" -# out.scp is the output scp file, which is as in.scp but has the -# database-key first on each line. - -# Reads from first argument in.flist -# Writes to standard output trans.txt - -sub ParseTranscript() { - my $transcript_file = $_[0]; - open(F, "<$transcript_file") || die "Error opening phone transcription file $transcript_file\n"; - my $trans = "h#" ; - my $line = ; - chomp ($line); - # first line should be "h#". - ($line =~/h#/) || die "First line should be h#. Got line: $line"; - my @pieces; - while() { - chomp ; - @pieces = split(" ", $_); - @pieces == 3 || die "Error parsing file: $transcript_file, line: $_. Expected 3 fields. Found @pieces"; - $trans = $trans . " " . $pieces[2]; - } - ($pieces[2] =~/^h#/) || die "Last line should be h#"; - #$trans =~s/^h#// ; # first h# - #$trans =~s/h#/<\\s>/ ; # last h# - $trans =~s/^h#// ; # first h# - $trans =~s/h#$// ; # last h# - ($trans !~ m/h#/) || die "Found h# character in transcript, other than start or end."; - - close(F); - return $trans ; -} - -if(@ARGV != 4) { - die "usage: make_trans.sh prefix in.flist out.txt out.scp\n"; -} -($prefix, $in_flist, $out_txt, $out_scp) = @ARGV; - -open(G, "<$in_flist") || die "Opening file list $in_flist"; - -open(O, ">$out_txt") || die "Open output transcription file $out_txt"; - -open(P, ">$out_scp") || die "Open output scp file $out_scp"; - -while() { - my $sph_file = $_ ; - chomp ($sph_file) ; - $_ =~ m:/(\w+)/(\w+)\.WAV\s+$:i || die "bad scp line $_"; - $spkname = $1; - $uttname = $2; - $uttname =~ tr/a-z/A-Z/; - $spkname =~ s/_//g; # remove underscore from spk name to make key nicer. - $key = $prefix . "_" . $spkname . "_" . $uttname; - $key =~ tr/A-Z/a-z/; # Make it all lower case. - # to make the numerical and string-sorted orders the same. - my $transcript_file = substr($_, 0, length($_)-4) . "phn"; - if (! -e $transcript_file ) { - $transcript_file = substr($_, 0, length($_)-4) . "PHN"; - } - if (! -e $transcript_file ) { - print "Transcription file: $transcript_file missing." ; - } - - my $trans = &ParseTranscript($transcript_file); - $trans =~ tr/a-z/A-Z/; # Make it all upper case. - print P "$key $sph_file\n"; - print O "$key $trans\n"; - $n++; -} -close(O) || die "Closing output."; -close(P) || die "Closing output."; - - diff --git a/egs/timit/s3/local/timit_data_prep.sh b/egs/timit/s3/local/timit_data_prep.sh deleted file mode 100755 index ff6754b4a55..00000000000 --- a/egs/timit/s3/local/timit_data_prep.sh +++ /dev/null @@ -1,140 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Navdeep Jaitly -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. - -# The input is the 3 CDs from the LDC distribution of Resource Management. -# The script's argument is a directory which has three subdirectories: -# rm1_audio1 rm1_audio2 rm2_audio - -# Note: when creating your own data preparation scripts, it's a good idea -# to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the -# transcription file is exactly the same length as the scp file and is also -# sorted on utterance id (missing transcriptions should be removed from the -# scp file using e.g. scripts/filter_scp.pl) - -if [ $# != 1 ]; then - echo "Usage: ../../local/timit_data_prep.sh /path/to/TIMIT" - exit 1; -fi - -TIMIT_ROOT=$1 -S3_ROOT=`pwd` -mkdir -p data/local -cd data/local - -lower_case=0 -upper_case=0 -if [ -d $TIMIT_ROOT/TIMIT/TRAIN -a -d $TIMIT_ROOT/TIMIT/TEST ]; - then - upper_case=1 - train_folder=$TIMIT_ROOT/TIMIT/TRAIN - test_folder=$TIMIT_ROOT/TIMIT/TEST - spkr_info_file=$TIMIT_ROOT/TIMIT/DOC/SPKRINFO.TXT -elif [ -d $TIMIT_ROOT/timit/train -a -d $TIMIT_ROOT/timit/test ]; - then - lower_case=1 - train_folder=$TIMIT_ROOT/timit/train - test_folder=$TIMIT_ROOT/timit/test - spkr_info_file=$TIMIT_ROOT/timit/doc/spkrinfo.txt -else - echo "Error: run.sh requires a directory argument (an absolute pathname) that contains TIMIT/TRAIN and TIMIT/TEST or timit/train and timit/test." - exit 1; -fi - - -( - find $train_folder -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) > train_sph.flist - - -# make_trans.pl also creates the utterance id's and the kaldi-format scp file. -$S3_ROOT/local/make_trans.pl trn train_sph.flist train_trans.txt train_sph.scp || exit 1; -mv train_trans.txt tmp; sort -k 1 tmp > train_trans.txt -mv train_sph.scp tmp; sort -k 1 tmp > train_sph.scp -rm tmp - -sph2pipe=`cd $S3_ROOT ; cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe` -if [ ! -f $sph2pipe ]; then - echo "Could not find the sph2pipe program at $sph2pipe"; - exit 1; -fi -awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < train_sph.scp > train_wav.scp - -cat train_wav.scp | perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' > train.utt2spk -cat train.utt2spk | sort -k 2 | $S3_ROOT/scripts/utt2spk_to_spk2utt.pl > train.spk2utt - -echo "Creating coretest set." 
-test_speakers="mdab0 mwbt0 felc0 mtas1 mwew0 fpas0 mjmp0 mlnt0 fpkt0 mlll0 mtls0 fjlm0 mbpm0 mklt0 fnlp0 mcmj0 mjdh0 fmgd0 mgrt0 mnjm0 fdhc0 mjln0 mpam0 fmld0" -dev_speakers="faks0 fdac1 fjem0 mgwt0 mjar0 mmdb1 mmdm2 mpdf0 fcmh0 fkms0 mbdg0 mbwm0 mcsh0 fadg0" -dev_speakers="${dev_speakers} fdms0 fedw0 mgjf0 mglb0 mrtk0 mtaa0 mtdt0 mthc0 mwjg0 fnmr0 frew0 fsem0 mbns0 mmjr0 mdls0 mdlf0" -dev_speakers="${dev_speakers} mdvc0 mers0 fmah0 fdrw0 mrcs0 mrjm4 fcal1 mmwh0 fjsj0 majc0 mjsw0 mreb0 fgjd0 fjmg0 mroa0 mteb0 mjfc0 mrjr0 fmml0 mrws1" - - -if [ $upper_case == 1 ] ; then - test_speakers=`echo $test_speakers | tr '[:lower:]' '[:upper:]'` - dev_speakers=`echo $dev_speakers | tr '[:lower:]' '[:upper:]'` -fi - -rm -f test_sph.flist -for speaker in $test_speakers ; do -echo -n $speaker " " -( - find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) >> test_sph.flist -done -echo "" -num_lines=`wc -l test_sph.flist | awk '{print $1}'` -echo "# of utterances in coretest set = ${num_lines}" - -echo "Creating dev set." -rm -f dev_sph.flist -for speaker in $dev_speakers ; do -echo -n $speaker " " -( - find $test_folder/*/${speaker} -iname "*.wav" | perl -ane 'if (! m/sa[0-9].wav/i){ print $_ ; }' -) >> dev_sph.flist -done -echo "" -num_lines=`wc -l dev_sph.flist | awk '{print $1}'` -echo "# of utterances in dev set = ${num_lines}" - - -# make_trans.pl also creates the utterance id's and the kaldi-format scp file. -for test in test dev ; do - echo "Finalizing ${test}" - $S3_ROOT/local/make_trans.pl ${test} ${test}_sph.flist ${test}_trans.txt ${test}_sph.scp || exit 1; - mv ${test}_trans.txt tmp; sort -k 1 tmp > ${test}_trans.txt - mv ${test}_sph.scp tmp; sort -k 1 tmp > ${test}_sph.scp - rm tmp; - awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${test}_sph.scp > ${test}_wav.scp - - cat ${test}_wav.scp | perl -ane 'm/^(\w+_(\w+)\w_\w+) / || die; print "$1 $2\n"' > ${test}.utt2spk - cat ${test}.utt2spk | sort -k 2 | $S3_ROOT/scripts/utt2spk_to_spk2utt.pl > ${test}.spk2utt -done - - -# Need to set these on the basis of file name first characters. -#grep -v "^;" DOC/SPKRINFO.TXT | awk '{print $1 " " $2 ; } ' | \ -cat $spkr_info_file | \ - perl -ane 'tr/A-Z/a-z/;print;' | grep -v ';' | \ - awk '{print $2$1, $2}' | sort | uniq > spk2gender.map || exit 1; - - -echo timit_data_prep succeeded. diff --git a/egs/timit/s3/local/timit_format_data.sh b/egs/timit/s3/local/timit_format_data.sh deleted file mode 100755 index ba1f5a955f3..00000000000 --- a/egs/timit/s3/local/timit_format_data.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Navdeep Jaitly -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. - - - -if [ -f path.sh ]; then . 
path.sh; fi - -arpa_lm=data/local/lm/biphone/lm_unpruned.gz - -data_list="train test dev" - -for x in lang lang_test $data_list; do - mkdir -p data/$x -done - -# Copy stuff into its final location: - -for x in $data_list; do - cp data/local/$x.spk2utt data/$x/spk2utt || exit 1; - cp data/local/$x.utt2spk data/$x/utt2spk || exit 1; - cp data/local/${x}_wav.scp data/$x/wav.scp || exit 1; - cp data/local/${x}_trans.txt data/$x/text || exit 1; - scripts/filter_scp.pl data/$x/spk2utt data/local/spk2gender.map > data/$x/spk2gender || exit 1; -done - - -scripts/make_words_symtab.pl < data/local/lexicon.txt > data/lang/words.txt -scripts/make_phones_symtab.pl < data/local/lexicon.txt > data/lang/phones.txt -cp data/lang/words.txt data/lang_test/words.txt - -silphones="sil"; # This would in general be a space-separated list of all silence phones. E.g. "sil vn" -# Generate colon-separated lists of silence and non-silence phones. -scripts/silphones.pl data/lang/phones.txt "$silphones" data/lang/silphones.csl \ - data/lang/nonsilphones.csl - -ndisambig=`scripts/add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt` -ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence in lexicon FST. -scripts/add_disambig.pl data/lang/phones.txt $ndisambig > data/lang_test/phones_disambig.txt -cp data/lang_test/phones_disambig.txt data/lang/ # needed for MMI. - -echo "Creating L.fst" -silprob=0.3 # same prob as word -scripts/make_lexicon_fst.pl data/local/lexicon.txt $silprob sil | \ - fstcompile --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > data/lang/L.fst -echo "Done creating L.fst" - - -# L_disambig.fst has the disambiguation symbols (c.f. Mohri's papers) -echo "Creating L_disambig.fst" -scripts/make_lexicon_fst.pl data/local/lexicon_disambig.txt $silprob sil '#'$ndisambig | \ - fstcompile --isymbols=data/lang_test/phones_disambig.txt --osymbols=data/lang_test/words.txt \ - --keep_isymbols=false --keep_osymbols=false | fstarcsort --sort_type=olabel \ - > data/lang_test/L_disambig.fst -echo "Done creating L_disambig.fst" - -cp data/lang_test/L_disambig.fst data/lang/ # Needed for MMI training. -echo "Creating G.fst" - -#gunzip -c "$arpa_lm" | \ -# grep -v '<s> <s>' | \ -# grep -v '</s> <s>' | \ -# grep -v '</s> </s>' | \ -# arpa2fst - | fstprint | \ -# scripts/remove_oovs.pl /dev/null | \ -# scripts/eps2disambig.pl | scripts/s2eps.pl | \ -# fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang_test/words.txt --keep_isymbols=false \ -# --keep_osymbols=false > data/lang_test/G.fst -gunzip -c "$arpa_lm" | \ - grep -v '<s> <s>' | \ - grep -v '</s> <s>' | \ - grep -v '</s> </s>' | \ - arpa2fst - | fstprint | \ - scripts/remove_oovs.pl /dev/null | \ - scripts/s2eps.pl | \ - fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang_test/words.txt --keep_isymbols=false \ - --keep_osymbols=false > data/lang_test/G.fst - -echo "G.fst created. How stochastic is it ?" -fstisstochastic data/lang_test/G.fst - -# Checking that G.fst is determinizable. -fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. - -# Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. - -# Checking that disambiguated lexicon times G is determinizable -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstdeterminize >/dev/null || echo Error - -# Checking that LG is stochastic: -echo "How stochastic is LG.fst."
-fstisstochastic data/lang_test/G.fst -fsttablecompose data/lang/L.fst data/lang_test/G.fst | \ - fstisstochastic - -# Checking that LG_disambig.fst is stochastic: -echo "How stochastic is LG_disambig.fst." -fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ - fstisstochastic - - -## Check lexicon. -## just have a look and make sure it seems sane. -echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - - -silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` -nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > data/lang/topo - -for x in phones.txt words.txt silphones.csl nonsilphones.csl topo; do - cp data/lang/$x data/lang_test/$x || exit 1; -done - -echo timit_format_data succeeded. diff --git a/egs/timit/s3/local/timit_train_lms.sh b/egs/timit/s3/local/timit_train_lms.sh deleted file mode 100755 index eb61122442d..00000000000 --- a/egs/timit/s3/local/timit_train_lms.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -# Copyright 2012 Navdeep Jaitly - -# Derived from swbd/s3/local/swbd_p1_train_lms.sh scripts. - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# To be run from one directory above this script. -# This script takes no arguments. It assumes you have already run -# timit_data_prep.sh. -# It takes as input the file -# [argument 1]/train_trans.txt -# and uses it to create the lexicon (just the phones) and the biphone language model. -# Creates folder [argument 1]/lm - -if [ $# != 1 ]; then - echo "Usage: ../../local/timit_train_lms.sh [data path]" - echo "eg: ../../local/timit_train_lms.sh data/local" - exit 1; -fi - - -dir=$1/lm -trans_file=$1/train_trans.txt -phones_file=$1/phones.txt -lex_file=$1/lexicon.txt - -if [ ! -e $trans_file ]; then - echo "Transcript file $trans_file not found. Did you run local/timit_data_prep.sh" - exit 1; -fi - -mkdir -p $dir -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:`pwd`/../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - -mkdir -p $dir - -echo "Creating phones file, and monophone lexicon (mapping phones to itself)." 
-cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq | awk '{print tolower($1) ; }' > $phones_file -cat $phones_file | awk '{print toupper($1) " " $1 ; }' > $lex_file -cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon. -cat $trans_file | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(cat $lex_file | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts - -# note: we probably won't really make use of as there aren't any OOVs -cat $dir/unigram.counts | awk '{print $2}' | local/get_word_map.pl "" "" > $dir/word_map - -# note: ignore 2nd field of train.txt, it's the utterance-id. -cat $trans_file | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz - -! merge_ngrams &/dev/null && \ - echo merge_ngrams not found in kaldi_lm. You need to have kaldi_lm on your path OR && \ - echo You can do the following: && \ - echo 1. Install the latest version from http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz && \ - echo 2. you delete kaldi_lm, and kaldi_lm.tar.gz in the tools folder. This script will automatically install it. && \ - exit 1; - -echo "Creating biphone model" -local/create_biphone_lm.sh $dir diff --git a/egs/timit/s3/path.sh b/egs/timit/s3/path.sh deleted file mode 100755 index 35e306fa45e..00000000000 --- a/egs/timit/s3/path.sh +++ /dev/null @@ -1,3 +0,0 @@ - -export PATH=$PWD/scripts/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD:$PATH -export LC_ALL=C diff --git a/egs/timit/s3/run.sh b/egs/timit/s3/run.sh deleted file mode 100755 index 97b9973d8c3..00000000000 --- a/egs/timit/s3/run.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Navdeep Jaitly - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be safe we suggest running the recipe line by line. Otherwise -# comment out the following line -exit 1; - -. path.sh -local/timit_data_prep.sh /ais/gobi2/speech/TIMIT || exit 1; -# local/timit_data_prep.sh /export/corpora5/LDC/LDC93S1 || exit 1; -local/timit_train_lms.sh data/local || exit 1 ; -local/timit_format_data.sh || exit 1; - -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. -mfccdir=mfccs - -for test in train test dev ; do - steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4 -done - -# train monophone system. -steps/train_mono.sh data/train data/lang exp/mono || exit 1; - -scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1; -echo "Decoding test datasets." 
-for test in dev test ; do - steps/decode_deltas.sh exp/mono data/$test data/lang exp/mono/decode_$test & -done -wait -scripts/average_wer.sh exp/mono/decode_*/wer > exp/mono/wer || exit 1; - -# Get alignments from monophone system. -echo "Creating training alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali_train || exit 1; -echo "Creating dev alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev || exit 1; -echo "Creating test alignments to use to train other systems such as ANN-HMM." -steps/align_deltas.sh data/test data/lang exp/mono exp/mono_ali_test || exit 1; - - diff --git a/egs/timit/s3/scripts/add_disambig.pl b/egs/timit/s3/scripts/add_disambig.pl deleted file mode 100755 index 9036b484e29..00000000000 --- a/egs/timit/s3/scripts/add_disambig.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds some specified number of disambig symbols to a symbol table. -# Adds these as #1, #2, etc. -# If the --include-zero option is specified, includes an extra one -# #0. -if(!(@ARGV == 2 || (@ARGV ==3 && $ARGV[0] eq "--include-zero"))) { - die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; -} - -if(@ARGV == 3) { - $include_zero = 1; - $ARGV[0] eq "--include-zero" || die "Bad option/first argument $ARGV[0]"; - shift @ARGV; -} else { - $include_zero = 0; -} - -$input = $ARGV[0]; -$nsyms = $ARGV[1]; - -open(F, "<$input") || die "Opening file $input"; - -while() { - @A = split(" ", $_); - @A == 2 || die "Bad line $_"; - $lastsym = $A[1]; - print; -} - -if(!defined($lastsym)){ - die "Empty symbol file?"; -} - -if($include_zero) { - $lastsym++; - print "#0 $lastsym\n"; -} - -for($n = 1; $n <= $nsyms; $n++) { - $y = $n + $lastsym; - print "#$n $y\n"; -} diff --git a/egs/timit/s3/scripts/add_lex_disambig.pl b/egs/timit/s3/scripts/add_lex_disambig.pl deleted file mode 100755 index 86d96848c97..00000000000 --- a/egs/timit/s3/scripts/add_lex_disambig.pl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. 
-# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. - -if(@ARGV != 2) { - die "Usage: add_lex_disambig.pl [ --sil silphone ] lexicon.txt lexicon_disambig.txt " -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -$max_disambig = 0; -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - $phnseq = join(" ",@A); - if(!defined $issubseq{$phnseq} - && $count{$phnseq}==1) { - ; # Do nothing. - } else { - if($phnseq eq "") { # need disambig symbols for the empty string - # that are not used anywhere else. - $max_disambig++; - $reserved{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $curnumber = $disambig_of{$phnseq}; - if(!defined{$curnumber}) { $curnumber = 0; } - $curnumber++; # now 1 or 2, ... - while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols - if($curnumber > $max_disambig) { - $max_disambig = $curnumber; - } - $disambig_of{$phnseq} = $curnumber; - $phnseq = $phnseq . " #" . $curnumber; - } - } - print O "$word\t$phnseq\n"; -} - -print $max_disambig . "\n"; - diff --git a/egs/timit/s3/scripts/average_wer.sh b/egs/timit/s3/scripts/average_wer.sh deleted file mode 100755 index a2c9c35109d..00000000000 --- a/egs/timit/s3/scripts/average_wer.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from one directory above this script. 
- -grep WER $* | \ - awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", (100.0*n)/d, n, d); }' diff --git a/egs/timit/s3/scripts/collapse_phones.pl b/egs/timit/s3/scripts/collapse_phones.pl deleted file mode 100755 index f2126a48882..00000000000 --- a/egs/timit/s3/scripts/collapse_phones.pl +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env perl -use strict ; - -my $ignore_first_field = 0; -if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - -my $symtab = shift @ARGV; - -if(!defined $symtab) { - die "Usage: collapse_phones.pl --ignore-first-field symtab [phoneme mapping] > output transcriptions\n"; -} - -my $mapping_str = shift @ARGV; -if(!defined $mapping_str) { - die "Usage: collapse_phones.pl --ignore-first-field symtab [phoneme mapping] > output transcriptions\n"; -} - -my %mapping; -my @parts = split(",", $mapping_str); -for my $part (@parts) { - my ($from, $to) = split(":", $part); - $mapping{uc($from)} = uc($to) ; -} - -my %sym2int ; -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while(<F>) { - my @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -# change the mappings. -my %int2int ; -foreach my $key (keys %sym2int) { - my $value = $sym2int{$key} ; - if (exists($mapping{$key})) { - $int2int{$value} = $sym2int{$mapping{$key}} ; - } else { - $int2int{$value} = $value ; - } -} - -while(<>) { - my @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - my $key = shift @A; - print $key . " "; - } - foreach $a (@A) { - my $i = $int2int{$a}; - if(!defined ($i)) { - die "collapse_phones.pl: undefined symbol $a\n"; - } - print $i . " "; - } - print "\n"; -} - - diff --git a/egs/timit/s3/scripts/eps2disambig.pl b/egs/timit/s3/scripts/eps2disambig.pl deleted file mode 100755 index 049802b0888..00000000000 --- a/egs/timit/s3/scripts/eps2disambig.pl +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:; - print; -} diff --git a/egs/timit/s3/scripts/filter_scp.pl b/egs/timit/s3/scripts/filter_scp.pl deleted file mode 100755 index c60b9800f84..00000000000 --- a/egs/timit/s3/scripts/filter_scp.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids and filters an scp -# file (or any file whose first field is an utterance id), printing -# out only those lines whose first field is in id_list. - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl id_list [in.scp] > out.scp "; -} - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - if($seen{$A[0]}) { - print $_; - } -} diff --git a/egs/timit/s3/scripts/int2sym.pl b/egs/timit/s3/scripts/int2sym.pl deleted file mode 100755 index ad85ef34993..00000000000 --- a/egs/timit/s3/scripts/int2sym.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_noninteger = 0; -$ignore_first_field = 0; -$field = -1; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--field") { - shift @ARGV; $field = $ARGV[0]+0; shift @ARGV; - if ($field < 1) { die "Bad argument to --field option: $field"; } - } -} - -if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; } -$zfield = $field-1; # Change to zero-based indexing. - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input] > output\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $int2sym{$A[1]} = $A[0]; -} - -sub int2sym { - my $a = shift @_; - my $pos = shift @_; - if($a !~ m:^\d+$:) { # not all digits.. - if($ignore_noninteger) { - print $a . " "; - next; - } else { - if($pos == 0) { - die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n"; - } else { - die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n"; - } - } - } - $s = $int2sym{$a}; - if(!defined ($s)) { - die "int2sym.pl: integer $a not in symbol table $symtab."; - } - return $s; -} - -$error = 0; -while(<>) { - @A = split(" ", $_); - if($ignore_first_field) { - $key = shift @A; - print $key . 
" "; - } - if ($field != -1) { - if ($zfield <= $#A && $zfield >= 0) { - $a = $A[$zfield]; - $A[$zfield] = int2sym($a, $zfield); - } - print join(" ", @A); - } else { - for ($pos = 0; $pos <= $#A; $pos++) { - $a = $A[$pos]; - $s = int2sym($a, $pos); - print $s . " "; - } - } - print "\n"; -} - - - diff --git a/egs/timit/s3/scripts/is_sorted.sh b/egs/timit/s3/scripts/is_sorted.sh deleted file mode 100755 index ac6ae42e74e..00000000000 --- a/egs/timit/s3/scripts/is_sorted.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Usage: is_sorted.sh [script-file] -# This script returns 0 (success) if the script file argument [or standard input] -# is sorted and 1 otherwise. - -export LC_ALL=C - -if [ $# == 0 ]; then - scp=- -fi -if [ $# == 1 ]; then - scp=$1 -fi -if [ $# -gt 1 -o "$1" == "--help" -o "$1" == "-h" ]; then - echo "Usage: is_sorted.sh [script-file]" - exit 1 -fi - -cat $scp > /tmp/tmp1.$$ -sort /tmp/tmp1.$$ > /tmp/tmp2.$$ -cmp /tmp/tmp1.$$ /tmp/tmp2.$$ >/dev/null -ret=$? -rm /tmp/tmp1.$$ /tmp/tmp2.$$ -if [ $ret == 0 ]; then - exit 0; -else - echo "is_sorted.sh: script file $scp is not sorted"; - exit 1; -fi diff --git a/egs/timit/s3/scripts/make_lexicon_fst.pl b/egs/timit/s3/scripts/make_lexicon_fst.pl deleted file mode 100755 index ada17f64e11..00000000000 --- a/egs/timit/s3/scripts/make_lexicon_fst.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST (no pron-probs involved). - -if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] lexiconfst.txt" -} - -$lexfn = shift @ARGV; -if(@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2){ - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - - -if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. 
- while() { - @A = split(" ", $_); - $w = shift @A; - if(@A == 0) { # For empty words ( and ) insert no optional - # silence (not needed as adjacent words supply it).... - # actually we only hit this case for the lexicon without disambig - # symbols but doesn't ever matter as training transcripts don't have or . - print "$loopstate\t$loopstate\t\t$w\n"; - } else { - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while() { - @A = split(" ", $_); - $w = shift @A; - if(@A == 0) { # For empty words ( and ) insert no optional - # silence (not needed as adjacent words supply it).... - # actually we only hit this case for the lexicon without disambig - # symbols but doesn't ever matter as training transcripts don't have or . - print "$loopstate\t$loopstate\t\t$w\n"; - } else { - $is_silence_word = (@A == 1 && $A[0] eq $silphone); # boolean. - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } else { - if(! $is_silence_word) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps\n"; - } - $word_or_eps = ""; - } - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/egs/timit/s3/scripts/make_phones_symtab.pl b/egs/timit/s3/scripts/make_phones_symtab.pl deleted file mode 100755 index 03b8cbe7af3..00000000000 --- a/egs/timit/s3/scripts/make_phones_symtab.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# make_phones_symtab.pl < lexicon.txt > phones.txt - - -while(<>) { - @A = split(" ", $_); - for ($i=1; $i<@A; $i++) { - $P{$A[$i]} = 1; # seen it. 
- } -} - -print "<eps>\t0\n"; -$n = 1; -foreach $p (sort keys %P) { - if($p ne "") { - print "$p\t$n\n"; - $n++; - } -} - -print "sil\t$n\n"; - diff --git a/egs/timit/s3/scripts/make_rm_dict.pl b/egs/timit/s3/scripts/make_rm_dict.pl deleted file mode 100755 index 8aee98e7481..00000000000 --- a/egs/timit/s3/scripts/make_rm_dict.pl +++ /dev/null @@ -1,130 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Yanmin Qian Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This file takes as input the file pcdsril.txt that comes with the RM -# distribution, and creates the dictionary used in RM training. - -# make_rm_dct.pl pcdsril.txt > dct.txt - -if (@ARGV != 1) { - die "usage: make_rm_dct.pl pcdsril.txt > dct.txt\n"; -} -unless (open(IN_FILE, "@ARGV[0]")) { - die ("can't open @ARGV[0]"); -} - -while ($line = <IN_FILE>) -{ - chop($line); - if (($line =~ /^[a-z]/)) - { - $line =~ s/\+1//g; - @LineArray = split(/\s+/,$line); - @LineArray[0] = uc(@LineArray[0]); - - printf "%-16s", @LineArray[0]; - for ($i = 1; $i < @LineArray; $i ++) - { - if (@LineArray[$i] eq 'q') - {} - elsif (@LineArray[$i] eq 'zh') - { - printf "sh "; - } - elsif (@LineArray[$i] eq 'eng') - { - printf "ng "; - } - elsif (@LineArray[$i] eq 'hv') - { - printf "hh "; - } - elsif (@LineArray[$i] eq 'em') - { - printf "m "; - } - elsif (@LineArray[$i] eq 'axr') - { - printf "er "; - } - elsif (@LineArray[$i] eq 'tcl') - { - if (@LineArray[$i+1] ne 't') - { - printf "td "; - } - } - elsif (@LineArray[$i] eq 'dcl') - { - if (@LineArray[$i+1] ne 'd') - { - printf "dd "; - } - } - elsif (@LineArray[$i] eq 'kcl') - { - if (@LineArray[$i+1] ne 'k') - { - printf "kd "; - } - } - elsif (@LineArray[$i] eq 'pcl') - { - if (@LineArray[$i+1] ne 'p') - { - printf "pd "; - } - } - elsif (@LineArray[$i] eq 'bcl') - { - if (@LineArray[$i+1] ne 'b') - { - printf "b "; - } - } - elsif (@LineArray[$i] eq 'gcl') - { - if (@LineArray[$i+1] ne 'g') - { - printf "g "; - } - } - elsif (@LineArray[$i] eq 't') - { - if (@LineArray[$i+1] ne 's') - { - printf "@LineArray[$i] "; - } - else - { - printf "ts "; - $i++; - } - } - else - { - printf "@LineArray[$i] "; - } - } - printf "\n"; - } -} - -printf "!SIL sil\n"; - -close(IN_FILE); - - diff --git a/egs/timit/s3/scripts/make_rm_lm.pl b/egs/timit/s3/scripts/make_rm_lm.pl deleted file mode 100755 index 053fb294329..00000000000 --- a/egs/timit/s3/scripts/make_rm_lm.pl +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2010-2011 Yanmin Qian Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This file takes as input the file wp_gram.txt that comes with the RM -# distribution, and creates the language model as an acceptor in FST form. - -# make_rm_lm.pl wp_gram.txt > G.txt - -if (@ARGV != 1) { - print "usage: make_rm_lm.pl wp_gram.txt > G.txt\n"; - exit(0); -} -unless (open(IN_FILE, "@ARGV[0]")) { - die ("can't open @ARGV[0]"); -} - - -$flag = 0; -$count_wrd = 0; -$cnt_ends = 0; -$init = ""; - -while ($line = ) -{ - chop($line); - - $line =~ s/ //g; - - if(($line =~ /^>/)) - { - if($flag == 0) - { - $flag = 1; - } - $line =~ s/>//g; - $hashcnt{$init} = $i; - $init = $line; - $i = 0; - $count_wrd++; - @LineArray[$count_wrd - 1] = $init; - $hashwrd{$init} = 0; - } - elsif($flag != 0) - { - - $hash{$init}[$i] = $line; - $i++; - if($line =~ /SENTENCE-END/) - { - $cnt_ends++; - } - } - else - {} -} - -$hashcnt{$init} = $i; - -$num = 0; -$weight = 0; -$init_wrd = "SENTENCE-END"; -$hashwrd{$init_wrd} = @LineArray; -for($i = 0; $i < $hashcnt{$init_wrd}; $i++) -{ - $weight = -log(1/$hashcnt{$init_wrd}); - $hashwrd{$hash{$init_wrd}[$i]} = $i + 1; - print "0 $hashwrd{$hash{$init_wrd}[$i]} $hash{$init_wrd}[$i] $hash{$init_wrd}[$i] $weight\n"; -} -$num = $i; - -for($i = 0; $i < @LineArray; $i++) -{ - if(@LineArray[$i] eq 'SENTENCE-END') - {} - else - { - if($hashwrd{@LineArray[$i]} == 0) - { - $num++; - $hashwrd{@LineArray[$i]} = $num; - } - for($j = 0; $j < $hashcnt{@LineArray[$i]}; $j++) - { - $weight = -log(1/$hashcnt{@LineArray[$i]}); - if($hashwrd{$hash{@LineArray[$i]}[$j]} == 0) - { - $num++; - $hashwrd{$hash{@LineArray[$i]}[$j]} = $num; - } - if($hash{@LineArray[$i]}[$j] eq 'SENTENCE-END') - { - print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $weight\n" - } - else - { - print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $hash{@LineArray[$i]}[$j] $hash{@LineArray[$i]}[$j] $weight\n"; - } - } - } -} - -print "$hashwrd{$init_wrd} 0\n"; -close(IN_FILE); - - diff --git a/egs/timit/s3/scripts/make_roots.pl b/egs/timit/s3/scripts/make_roots.pl deleted file mode 100755 index 07c224379b6..00000000000 --- a/egs/timit/s3/scripts/make_roots.pl +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Written by Dan Povey 9/21/2010. Apache 2.0 License. - -# This version of make_roots.pl is specialized for RM. - -# This script creates the file roots.txt which is an input to train-tree.cc. It -# specifies how the trees are built. 
The input file phone-sets.txt is a partial -# version of roots.txt in which phones are represented by their spelled form, not -# their symbol id's. E.g. at input, phone-sets.txt might contain; -# shared not-split sil -# Any phones not specified in phone-sets.txt but present in phones.txt will -# be given a default treatment. If the --separate option is given, we create -# a separate tree root for each of them, otherwise they are all lumped in one set. -# The arguments shared|not-shared and split|not-split are needed if any -# phones are not specified in phone-sets.txt. What they mean is as follows: -# if shared=="shared" then we share the tree-root between different HMM-positions -# (0,1,2). If split=="split" then we actually do decision tree splitting on -# that root, otherwise we forbid decision-tree splitting. (The main reason we might -# set this to false is for silence when -# we want to ensure that the HMM-positions will remain with a single PDF id. - - -$separate = 0; -if($ARGV[0] eq "--separate") { - $separate = 1; - shift @ARGV; -} - -if(@ARGV != 4) { - die "Usage: make_roots.pl [--separate] phones.txt silence-phone-list[integer,colon-separated] shared|not-shared split|not-split > roots.txt\n"; -} - - -($phonesfile, $silphones, $shared, $split) = @ARGV; -if($shared ne "shared" && $shared ne "not-shared") { - die "Third argument must be \"shared\" or \"not-shared\"\n"; -} -if($split ne "split" && $split ne "not-split") { - die "Third argument must be \"split\" or \"not-split\"\n"; -} - - - -open(F, "<$phonesfile") || die "Opening file $phonesfile"; - -while() { - @A = split(" ", $_); - if(@A != 2) { - die "Bad line in phones symbol file: ".$_; - } - if($A[1] != 0) { - $symbol2id{$A[0]} = $A[1]; - $id2symbol{$A[1]} = $A[0]; - } -} - -if($silphones == ""){ - die "Empty silence phone list in make_roots.pl"; -} -foreach $silphoneid (split(":", $silphones)) { - defined $id2symbol{$silphoneid} || die "No such silence phone id $silphoneid"; - # Give each silence phone its own separate pdfs in each state, but - # no sharing (in this recipe; WSJ is different.. in this recipe there - #is only one silence phone anyway.) - $issil{$silphoneid} = 1; - print "not-shared not-split $silphoneid\n"; -} - -$idlist = ""; -$remaining_phones = ""; - -if($separate){ - foreach $a (keys %id2symbol) { - if(!defined $issil{$a}) { - print "$shared $split $a\n"; - } - } -} else { - print "$shared $split "; - foreach $a (keys %id2symbol) { - if(!defined $issil{$a}) { - print "$a "; - } - } - print "\n"; -} diff --git a/egs/timit/s3/scripts/make_words_symtab.pl b/egs/timit/s3/scripts/make_words_symtab.pl deleted file mode 100755 index 509078898fc..00000000000 --- a/egs/timit/s3/scripts/make_words_symtab.pl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -# make_words_symtab.pl < lexicon.txt > words.txt - - -while(<>) { - @A = split(" ", $_); - $W{$A[0]} = 1; -} - -print "\t0\n"; -$n = 1; -foreach $w (sort keys %W) { - if($w ne "") { - print "$w\t$n\n"; - $n++; - } -} - -print "!SIL\t$n\n"; - diff --git a/egs/timit/s3/scripts/mkgraph.sh b/egs/timit/s3/scripts/mkgraph.sh deleted file mode 100755 index e7d3fbe6b19..00000000000 --- a/egs/timit/s3/scripts/mkgraph.sh +++ /dev/null @@ -1,107 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -N=3 -P=1 -clean=false - -for x in 1 2 3; do - if [ $1 == "--mono" ]; then - N=1; - P=0; - shift; - fi - if [ $1 == "--clean" ]; then - clean=true - shift; - fi - -done - -if [ $# != 3 ]; then - echo "Usage: scripts/mkgraph.sh " - echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -lang=$1 -tree=$2/tree -model=$2/final.mdl -dir=$3 - -if $clean; then rm -r $lang/tmp; fi - -mkdir -p $dir - -tscale=1.0 -loopscale=0.1 - -# If $lang/tmp/LG.fst does not exist or is older than its sources, make it... -# (note: the [[ ]] brackets make the || type operators work (inside [ ], we -# would have to use -o instead), -f means file exists, and -ot means older than). - -mkdir -p $lang/tmp -if [[ ! -f $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ - $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded > $lang/tmp/LG.fst || exit 1; - fstisstochastic $lang/tmp/LG.fst || echo "warning: LG not stochastic." -fi - -if [ ! -f $lang/phones_disambig.txt ]; then - echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)" - exit 1; -fi - -grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phones.list - - -clg=$lang/tmp/CLG_${N}_${P}.fst - -if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then - fstcomposecontext --context-size=$N --central-position=$P \ - --read-disambig-syms=$lang/tmp/disambig_phones.list \ - --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \ - $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg - fstisstochastic $clg || echo "warning: CLG not stochastic." -fi - -if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model ]]; then - make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ - > $dir/Ha.fst || exit 1; -fi - -if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ - $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ - | fstrmsymbols $dir/disambig_tid.list | fstrmepslocal | \ - fstminimizeencoded > $dir/HCLGa.fst || exit 1; - fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" -fi - -if [[ ! 
-f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; - - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." - fi -fi diff --git a/egs/timit/s3/scripts/remove_oovs.pl b/egs/timit/s3/scripts/remove_oovs.pl deleted file mode 100755 index 532d7f295ea..00000000000 --- a/egs/timit/s3/scripts/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(<S>){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/egs/timit/s3/scripts/s2eps.pl b/egs/timit/s3/scripts/s2eps.pl deleted file mode 100755 index ffeeb8eb6af..00000000000 --- a/egs/timit/s3/scripts/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces <s> and </s> with <eps> (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; } - if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; } - } - print join("\t", @A) .
"\n"; -} diff --git a/egs/timit/s3/scripts/silphones.pl b/egs/timit/s3/scripts/silphones.pl deleted file mode 100755 index 3ff85dfe3bb..00000000000 --- a/egs/timit/s3/scripts/silphones.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# creates integer lists of silence and non-silence phones in files, -# e.g. silphones.csl="1:2:3 \n" -# and nonsilphones.csl="4:5:6:7:...:24\n"; - -if(@ARGV != 4) { - die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl"; -} - -($symtab, $sillist, $silphones, $nonsilphones) = @ARGV; -open(S,"<$symtab") || die "Opening symbol table $symtab"; - - -foreach $s (split(" ", $sillist)) { - $issil{$s} = 1; -} - -@sil = (); -@nonsil = (); -while(){ - @A = split(" ", $_); - @A == 2 || die "Bad line $_ in phone-symbol-table file $symtab"; - ($sym, $int) = @A; - if($int != 0) { - if($issil{$sym}) { push @sil, $int; $seensil{$sym}=1; } - else { push @nonsil, $int; } - } -} - -foreach $k(keys %issil) { - if(!$seensil{$k}) { die "No such silence phone $k"; } -} -open(F, ">$silphones") || die "opening silphones file $silphones"; -open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones"; -print F join(":", @sil) . "\n"; -print G join(":", @nonsil) . "\n"; -close(F); -close(G); -if(@sil == 0) { print STDERR "Warning: silphones.pl no silence phones.\n" } -if(@nonsil == 0) { print STDERR "Warning: silphones.pl no non-silence phones.\n" } - diff --git a/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl b/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl deleted file mode 100755 index 23992f25dea..00000000000 --- a/egs/timit/s3/scripts/spk2utt_to_utt2spk.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- - -while(<>){ - @A = split(" ", $_); - @A > 1 || die "Invalid line in spk2utt file: $_"; - $s = shift @A; - foreach $u ( @A ) { - print "$u $s\n"; - } -} - - diff --git a/egs/timit/s3/scripts/split_scp.pl b/egs/timit/s3/scripts/split_scp.pl deleted file mode 100755 index 9ffb29b76f2..00000000000 --- a/egs/timit/s3/scripts/split_scp.pl +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program splits up any kind of .scp or archive-type file. -# If there is no utt2spk option it will work on any text file and -# will split it up with an approximately equal number of lines in -# each but. -# With the --utt2spk option it will work on anything that has the -# utterance-id as the first entry on each line; the utt2spk file is -# of the form "utterance speaker" (on each line). -# It splits it into equal size chunks as far as it can. If you use -# the utt2spk option it will make sure these chunks coincide with -# speaker boundaries. In this case, if there are more chunks -# than speakers (and in some other circumstances), some of the -# resulting chunks will be empty and it -# will print a warning. -# You will normally call this like: -# split_scp.pl scp scp.1 scp.2 scp.3 ... -# or -# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... -# Note that you can use this script to split the utt2spk file itself, -# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... - -if(@ARGV < 2 ) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... "; -} - -if($ARGV[0] =~ m:^-:) { - # Everything inside this block - # corresponds to what we do when the --utt2spk option is used. - $opt = shift @ARGV; - @A = split("=", $opt); - if(@A != 2 || $A[0] ne "--utt2spk") { - die "split_scp.pl: invalid option $ARGV[0]"; - } - $utt2spk_file = $A[1]; - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { - @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; - ($u,$s) = @A; - $utt2spk{$u} = $s; - } - $inscp = shift @ARGV; - open(I, "<$inscp") || die "Opening input scp file $inscp"; - @spkrs = (); - while() { - @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } - $u = $A[0]; - $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } - if(!defined $spk_count{$s}) { - push @spkrs, $s; - $spk_count{$s} = 0; - $spk_data{$s} = ""; - } - $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; - } - # Now split as equally as possible .. - # First allocate spks to files by given approximately - # equal #spks. - $numspks = @spkrs; # number of speakers. - $numscps = @ARGV; # number of output files. - $spksperscp = int( ($numspks+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. 
- for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scparray[$scpidx] = []; # [] is array reference. - for($n = $spksperscp * $scpidx; - $n < $numspks && $n < $spksperscp*($scpidx+1); - $n++) { - $spk = $spkrs[$n]; - push @{$scparray[$scpidx]}, $spk; - $scpcount[$scpidx] += $spk_count{$spk}; - } - } - # Now will try to reassign beginning + ending speakers - # to different scp's and see if it gets more balanced. - # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. - # We can show that if considering changing just 2 scp's, we minimize - # this by minimizing the squared difference in sizes. This is - # equivalent to minimizing the absolute difference in sizes. This - # shows this method is bound to converge. - - $changed = 1; - while($changed) { - $changed = 0; - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - # First try to reassign ending spk of this scp. - if($scpidx < $numscps-1) { - $sz = @{$scparray[$scpidx]}; - if($sz > 0) { - $spk = $scparray[$scpidx]->[$sz-1]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx]; - $nutt2 = $scpcount[$scpidx+1]; - if( abs( ($nutt2+$count) - ($nutt1-$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx+1] += $count; - $scpcount[$scpidx] -= $count; - pop @{$scparray[$scpidx]}; - unshift @{$scparray[$scpidx+1]}, $spk; - $changed = 1; - } - } - } - if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { - $spk = $scparray[$scpidx]->[0]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx-1]; - $nutt2 = $scpcount[$scpidx]; - if( abs( ($nutt2-$count) - ($nutt1+$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx-1] += $count; - $scpcount[$scpidx] -= $count; - shift @{$scparray[$scpidx]}; - push @{$scparray[$scpidx-1]}, $spk; - $changed = 1; - } - } - } - } - # Now print out the files... - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $ARGV[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; - $count = 0; - if(@{$scparray[$scpidx]} == 0) { - print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)"; - } - foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; - $count += $spk_count{$spk}; - } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } - close(F); - } -} else { - # This block is the "normal" case where there is no --utt2spk - # option and we just break into equal size chunks. - - $inscp = shift @ARGV; - open(I, "<$inscp") || die "Opening input scp file $inscp"; - - $numscps = @ARGV; # size of array. - @F = (); - while() { - push @F, $_; - } - $numlines = @F; - if($numlines == 0) { - print STDERR "split_scp.pl: warning: empty input scp file $inscp"; - } - $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. -# [just doing int() rounds down]. 
- for($scpidx = 0; $scpidx < @ARGV; $scpidx++) { - $scpfile = $ARGV[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; - for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) { - print O $F[$n]; - } - close(O) || die "Closing scp file $scpfile"; - } -} diff --git a/egs/timit/s3/scripts/sym2int.pl b/egs/timit/s3/scripts/sym2int.pl deleted file mode 100755 index ee22d3f13bd..00000000000 --- a/egs/timit/s3/scripts/sym2int.pl +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; -$ignore_first_field = 0; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } -} - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -while(<>) { - @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - foreach $a (@A) { - $i = $sym2int{$a}; - if(!defined ($i)) { - if($ignore_oov) { - print $a . " " ; - } else { - die "sym2int.pl: undefined symbol $a\n"; - } - } - print $i . " "; - } - print "\n"; -} - - diff --git a/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl b/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl deleted file mode 100755 index f5e61459bc9..00000000000 --- a/egs/timit/s3/scripts/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - $uttlist{$s} = $uttlist{$s} . "$u "; -} -foreach $s (@spklist) { - $l = $uttlist{$s}; - $l =~ s: $::; # remove trailing space. 
- print "$s $l\n"; -} diff --git a/egs/timit/s3/steps/align_deltas.sh b/egs/timit/s3/steps/align_deltas.sh deleted file mode 100755 index fd24edb789a..00000000000 --- a/egs/timit/s3/steps/align_deltas.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. - -# This script does training-data alignment given a model built using -# CMN + delta + delta-delta features. Its output, all in its own -# experimental directory, is cmvn.ark, ali, tree, and final.mdl -# (the last two are just copied from the source directory). - -# Option to use precompiled graphs from last phase, if these -# are available (i.e. if they were built with the same data). - -graphs= -if [ "$1" == --graphs ]; then - shift; - graphs=$1 - shift -fi - - -if [ $# != 4 ]; then - echo "Usage: steps/align_deltas.sh " - echo " e.g.: steps/align_deltas.sh data/train data/lang exp/tri1 exp/tri1_ali" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -lang=$2 -srcdir=$3 -dir=$4 - - -model=$srcdir/final.mdl - - -mkdir -p $dir -cp $model $dir/final.mdl || exit 1; # Create copy of that model... -cp $srcdir/tree $dir/tree || exit 1; # and the tree... - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" - - - -echo "Computing cepstral mean and variance statistics" -compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp \ - ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1; - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# Align all training data using the supplied model. - -echo "Aligning all training data" -if [ -z "$graphs" ]; then # --graphs option not supplied [-z means empty string] - # compute integer form of transcripts. - scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - gmm-align $scale_opts --beam=8 --retry-beam=40 $srcdir/tree $model $lang/L.fst \ - "$feats" ark:$dir/train.tra ark:$dir/ali 2> $dir/align.log || exit 1; - rm $dir/train.tra -else - gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $model \ - "$graphs" "$feats" ark:$dir/ali 2> $dir/align.log || exit 1; -fi - -echo "Done." diff --git a/egs/timit/s3/steps/decode_deltas.sh b/egs/timit/s3/steps/decode_deltas.sh deleted file mode 100755 index 9f886a79e2c..00000000000 --- a/egs/timit/s3/steps/decode_deltas.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Decoding script that works with a GMM model and delta-delta plus -# cepstral mean subtraction features. Used, for example, to decode -# mono/ and tri1/ - -if [ $# != 4 ]; then - echo "Usage: steps/decode_deltas.sh " - echo " e.g.: steps/decode_deltas.sh exp/mono data/test_feb89 data/test_lang exp/mono/decode_feb89" - exit 1; -fi - -srcdir=$1 -data=$2 -lang=$3 -dir=$4 -graphdir=$srcdir/graph - -mkdir -p $dir - -if [ -f path.sh ]; then . path.sh; fi - -if [ ! -f $srcdir/final.mdl ]; then - echo No model file $srcdir/final.mdl - exit 1; -fi - -if [[ ! -f $graphdir/HCLG.fst || $graphdir/HCLG.fst -ot $srcdir/final.mdl ]]; then - echo "Graph $graphdir/HCLG.fst does not exist or is too old." - exit 1; -fi - -# We only do one decoding pass, so there is no point caching the -# CMVN stats-- we make them part of a pipe. -feats="ark:compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# For Resource Management, we use beam of 30 and acwt of 1/7. -# More normal, LVCSR setups would have a beam of 13 and acwt of 1/15 or so. -# If you decode with a beam of 20 on an LVCSR setup it will be very slow. - -gmm-decode-faster --beam=30.0 --acoustic-scale=0.1429 --word-symbol-table=$lang/words.txt \ - $srcdir/final.mdl $graphdir/HCLG.fst "$feats" ark,t:$dir/test.tra ark,t:$dir/test.ali \ - 2> $dir/decode.log || exit 1; - -# In this setup there are no non-scored words, so -# scoring is simple. - -# the ,p option lets it score partial output without dying.. - -#scripts/sym2int.pl --ignore-first-field $lang/words.txt $data/text | \ -# compute-wer --mode=present ark:- ark,p:$dir/test.tra >& $dir/wer - -mapping="en:n,ao:aa,ax-h:ah,ax:ah,ix:ih,el:l,zh:sh,ux:uw,axr:er,em:m,nx:n,eng:ng,hv:hh,pcl:pau,tcl:pau,kcl:pau,q:pau,bcl:pau,dcl:pau,gcl:pau,epi:pau" -scripts/collapse_phones.pl --ignore-first-field $lang/words.txt "$mapping" < $dir/test.tra > tmp -scripts/sym2int.pl --ignore-first-field $lang/words.txt $data/text | \ - scripts/collapse_phones.pl --ignore-first-field $lang/words.txt "$mapping" |\ - compute-wer --mode=present ark:- ark,p:tmp >& $dir/wer - -rm tmp - - - diff --git a/egs/timit/s3/steps/make_mfcc.sh b/egs/timit/s3/steps/make_mfcc.sh deleted file mode 100755 index dc5b01c5f59..00000000000 --- a/egs/timit/s3/steps/make_mfcc.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. (one directory up from here) - -if [ $# != 4 ]; then - echo "usage: make_mfcc.sh "; - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -logdir=$2 -mfccdir=$3 -ncpus=$4 - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $mfccdir || exit 1; -mkdir -p $logdir || exit 1; - -scp=$data/wav.scp -config=conf/mfcc.conf -required="$scp $config" - -for f in $required; do - if [ ! -f $f ]; then - echo "make_mfcc.sh: no such file $f" - exit 1; - fi -done - -# note: in general, the double-parenthesis construct in bash "((" is "C-style -# syntax" where we can get rid of the $ for variable names, and omit spaces. -# The "for" loop in this style is a special construct. - -split_scps="" -for ((n=1; n<=ncpus; n++)); do - split_scps="$split_scps $logdir/wav$n.scp" -done - -scripts/split_scp.pl $scp $split_scps || exit 1; - -rm $logdir/.error 2>/dev/null -for ((n=1; n<=ncpus; n++)); do - log=$logdir/make_mfcc.$n.log - compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wav${n}.scp \ - ark,scp:$mfccdir/raw_mfcc_$name.$n.ark,$mfccdir/raw_mfcc_$name.$n.scp \ - 2> $log || touch $logdir/.error & -done -wait; - -if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc features for $name:" - tail $logdir/make_mfcc.*.log - exit 1; -fi - -# concatenate the .scp files together. -rm $data/feats.scp 2>/dev/null -for ((n=1; n<=ncpus; n++)); do - cat $mfccdir/raw_mfcc_$name.$n.scp >> $data/feats.scp -done - -rm $logdir/wav*.scp - -echo "Succeeded creating MFCC features for $name" - diff --git a/egs/timit/s3/steps/train_deltas.sh b/egs/timit/s3/steps/train_deltas.sh deleted file mode 100755 index 4a80f74a939..00000000000 --- a/egs/timit/s3/steps/train_deltas.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Triphone model training, using delta-delta features and cepstral -# mean normalization. It starts from an existing directory (e.g. -# exp/mono), supplied as an argument, which is assumed to be built using -# the same type of features. - -if [ $# != 4 ]; then - echo "Usage: steps/train_deltas.sh " - echo " e.g.: steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1" - exit 1; -fi - -if [ -f path.sh ]; then . path.sh; fi - -data=$1 -lang=$2 -alidir=$3 -dir=$4 - -if [ ! -f $alidir/final.mdl -o ! -f $alidir/ali ]; then - echo "Error: alignment dir $alidir does not contain final.mdl and ali" - exit 1; -fi - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -realign_iters="5 10 15 20"; -silphonelist=`cat $lang/silphones.csl` -numiters=25 # Number of iterations of training -maxiterinc=15 # Last iter to increase #Gauss on. -numleaves=1800 # target num-leaves in tree building. 
-numgauss=$[$numleaves + $numleaves/2]; # starting num-Gauss. - # Initially mix up to avg. 1.5 Gauss/state ( a bit more - # than this, due to state clustering... then slowly mix - # up to final amount. -totgauss=9000 # Target #Gaussians -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss - - -mkdir -p $dir - - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$alidir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# compute integer form of transcripts. -scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - - -echo "Accumulating tree stats" -acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$feats" \ - ark:$alidir/ali $dir/treeacc 2> $dir/acc.tree.log || exit 1; - - -echo "Computing questions for tree clustering" - -cat $lang/phones.txt | awk '{print $NF}' | grep -v -w 0 > $dir/phones.list -cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1; -scripts/int2sym.pl $lang/phones.txt < $dir/questions.txt > $dir/questions_syms.txt -compile-questions $lang/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1; - -# Have to make silence root not-shared because we will not split it. -scripts/make_roots.pl --separate $lang/phones.txt $silphonelist shared split \ - > $dir/roots.txt 2>$dir/roots.log || exit 1; - - -echo "Building tree" -build-tree --verbose=1 --max-leaves=$numleaves \ - $dir/treeacc $dir/roots.txt \ - $dir/questions.qst $lang/topo $dir/tree 2> $dir/train_tree.log || exit 1; - -gmm-init-model --write-occs=$dir/1.occs \ - $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/init_model.log || exit 1; - -gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \ - 2>$dir/mixup.log || exit 1; - -rm $dir/treeacc - -# Convert alignments generated from monophone model, to use as initial alignments. - -convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree ark:$alidir/ali ark:$dir/cur.ali 2>$dir/convert.log - # Debug step only: convert back and check they're the same. 
- convert-ali $dir/1.mdl $alidir/final.mdl $alidir/tree ark:$dir/cur.ali ark:- \ - 2>/dev/null | cmp - $alidir/ali || exit 1; - -# Make training graphs -echo "Compiling training graphs" -compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst ark:$dir/train.tra \ - "ark:|gzip -c >$dir/graphs.fsts.gz" 2>$dir/compile_graphs.log || exit 1; - -x=1 -while [ $x -lt $numiters ]; do - echo Pass $x - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $dir/$x.mdl \ - "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \ - ark:$dir/cur.ali 2> $dir/align.$x.log || exit 1; - fi - gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1; - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1; - rm $dir/$x.mdl $dir/$x.acc - rm $dir/$x.occs - if [[ $x -le $maxiterinc ]]; then - numgauss=$[$numgauss+$incgauss]; - fi - x=$[$x+1]; -done - -( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs ) - -echo Done diff --git a/egs/timit/s3/steps/train_mono.sh b/egs/timit/s3/steps/train_mono.sh deleted file mode 100755 index 3028ba3c339..00000000000 --- a/egs/timit/s3/steps/train_mono.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Flat start and monophone training, with delta-delta features. -# This script applies cepstral mean normalization (per speaker), -# unlike the corresponding script in s1/ - -if [ $# != 3 ]; then - echo "Usage: steps/train_mono.sh " - echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono" - exit 1; -fi - - -data=$1 -lang=$2 -dir=$3 - -if [ -f path.sh ]; then . path.sh; fi - -# Configuration: -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -numiters=30 # Number of iterations of training -maxiterinc=20 # Last iter to increase #Gauss on. -numgauss=250 # Initial num-Gauss (must be more than #states=3*phones). -totgauss=1000 # Target #Gaussians. -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss -realign_iters="1 2 3 4 5 6 7 8 9 10 12 15 20 25"; - -mkdir -p $dir -echo "Computing cepstral mean and variance statistics" - -compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp \ - ark:$dir/cmvn.ark 2>$dir/cmvn.log || exit 1; - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/cmvn.ark scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# compute integer form of transcripts. -scripts/sym2int.pl --ignore-first-field $lang/words.txt < $data/text > $dir/train.tra \ - || exit 1; - -echo "Initializing monophone system." 
- -gmm-init-mono "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39 \ - $dir/0.mdl $dir/tree 2> $dir/init.log || exit 1; - - -echo "Compiling training graphs" -compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ - ark:$dir/train.tra "ark:|gzip -c >$dir/graphs.fsts.gz" \ - 2>$dir/compile_graphs.log || exit 1 - -echo Pass 0 - -align-equal-compiled "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \ - ark,t,f:- 2>$dir/align.0.log | \ - gmm-acc-stats-ali --binary=true $dir/0.mdl "$feats" ark:- \ - $dir/0.acc 2> $dir/acc.0.log || exit 1; - -# In the following steps, the --min-gaussian-occupancy=3 option is important, otherwise -# we fail to est "rare" phones and later on, they never align properly. - -gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss \ - $dir/0.mdl $dir/0.acc $dir/1.mdl 2> $dir/update.0.log || exit 1; - -rm $dir/0.acc - -beam=4 # will change to 8 below after 1st pass -x=1 -while [ $x -lt $numiters ]; do - echo "Pass $x" - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] $dir/$x.mdl \ - "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" t,ark:$dir/cur.ali \ - 2> $dir/align.$x.log || exit 1; - fi - gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1; - gmm-est --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1; - rm $dir/$x.mdl $dir/$x.acc - if [ $x -le $maxiterinc ]; then - numgauss=$[$numgauss+$incgauss]; - fi - beam=8 - x=$[$x+1] -done - -( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl ) - -# example of showing the alignments: -# show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4 - diff --git a/egs/timit/s4/RESULTS b/egs/timit/s4/RESULTS deleted file mode 100644 index f11d53b6fdd..00000000000 --- a/egs/timit/s4/RESULTS +++ /dev/null @@ -1,24 +0,0 @@ -exp/mono/decode_dev_bg/wer_3 -compute-wer --text --mode=present ark:exp/mono/decode_dev_bg/test_trans.filt ark,p:- -%WER 33.73 [ 5079 / 15057, 392 ins, 1716 del, 2971 sub ] -%SER 100.00 [ 400 / 400 ] -Scored 400 sentences, 0 not present in hyp. - -exp/mono/decode_test_bg/wer -compute-wer --text --mode=present ark:exp/mono/decode_test_bg/test.trans ark,p:exp/mono/decode_test_bg/text -%WER 35.68 [ 2574 / 7215, 204 ins, 848 del, 1522 sub ] -%SER 100.00 [ 192 / 192 ] -Scored 192 sentences, 0 not present in hyp. - -exp/tri1/decode_dev_bg/wer_6 -compute-wer --text --mode=present ark:exp/tri1/decode_dev_bg/test.trans ark,p:- -%WER 28.68 [ 4319 / 15057, 474 ins, 1333 del, 2512 sub ] -%SER 100.00 [ 400 / 400 ] -Scored 400 sentences, 0 not present in hyp. - -exp/tri1/decode_test_bg/wer -compute-wer --text --mode=present ark:exp/tri1/decode_test_bg/test.trans ark,p:exp/tri1/decode_test_bg/text -%WER 31.02 [ 2238 / 7215, 226 ins, 704 del, 1308 sub ] -%SER 100.00 [ 192 / 192 ] -Scored 192 sentences, 0 not present in hyp. 
- diff --git a/egs/timit/s4/conf/dev_spk.list b/egs/timit/s4/conf/dev_spk.list deleted file mode 100644 index 564da1f1ec6..00000000000 --- a/egs/timit/s4/conf/dev_spk.list +++ /dev/null @@ -1,50 +0,0 @@ -faks0 -fdac1 -fjem0 -mgwt0 -mjar0 -mmdb1 -mmdm2 -mpdf0 -fcmh0 -fkms0 -mbdg0 -mbwm0 -mcsh0 -fadg0 -fdms0 -fedw0 -mgjf0 -mglb0 -mrtk0 -mtaa0 -mtdt0 -mthc0 -mwjg0 -fnmr0 -frew0 -fsem0 -mbns0 -mmjr0 -mdls0 -mdlf0 -mdvc0 -mers0 -fmah0 -fdrw0 -mrcs0 -mrjm4 -fcal1 -mmwh0 -fjsj0 -majc0 -mjsw0 -mreb0 -fgjd0 -fjmg0 -mroa0 -mteb0 -mjfc0 -mrjr0 -fmml0 -mrws1 diff --git a/egs/timit/s4/conf/phones.60-48-39.map b/egs/timit/s4/conf/phones.60-48-39.map deleted file mode 100644 index 4ebcc140fe7..00000000000 --- a/egs/timit/s4/conf/phones.60-48-39.map +++ /dev/null @@ -1,61 +0,0 @@ -aa aa aa -ae ae ae -ah ah ah -ao ao aa -aw aw aw -ax ax ah -ax-h ax ah -axr er er -ay ay ay -b b b -bcl vcl sil -ch ch ch -d d d -dcl vcl sil -dh dh dh -dx dx dx -eh eh eh -el el l -em m m -en en n -eng ng ng -epi epi sil -er er er -ey ey ey -f f f -g g g -gcl vcl sil -h# sil sil -hh hh hh -hv hh hh -ih ih ih -ix ix ih -iy iy iy -jh jh jh -k k k -kcl cl sil -l l l -m m m -n n n -ng ng ng -nx n n -ow ow ow -oy oy oy -p p p -pau sil sil -pcl cl sil -q -r r r -s s s -sh sh sh -t t t -tcl cl sil -th th th -uh uh uh -uw uw uw -ux uw uw -v v v -w w w -y y y -z z z -zh zh sh diff --git a/egs/timit/s4/conf/test_spk.list b/egs/timit/s4/conf/test_spk.list deleted file mode 100644 index 47f6653d64d..00000000000 --- a/egs/timit/s4/conf/test_spk.list +++ /dev/null @@ -1,24 +0,0 @@ -mdab0 -mwbt0 -felc0 -mtas1 -mwew0 -fpas0 -mjmp0 -mlnt0 -fpkt0 -mlll0 -mtls0 -fjlm0 -mbpm0 -mklt0 -fnlp0 -mcmj0 -mjdh0 -fmgd0 -mgrt0 -mnjm0 -fdhc0 -mjln0 -mpam0 -fmld0 diff --git a/egs/timit/s4/conf/topo.proto b/egs/timit/s4/conf/topo.proto deleted file mode 100644 index 72778cb66ba..00000000000 --- a/egs/timit/s4/conf/topo.proto +++ /dev/null @@ -1,20 +0,0 @@ - - - -NONSILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - - -SILENCEPHONES - - 0 0 0 0.75 1 0.25 - 1 1 1 0.75 2 0.25 - 2 2 2 0.75 3 0.25 - 3 - - diff --git a/egs/timit/s4/local/timit_data_prep.sh b/egs/timit/s4/local/timit_data_prep.sh deleted file mode 100755 index 7636d6aee0d..00000000000 --- a/egs/timit/s4/local/timit_data_prep.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists for TIMIT.\n\n -Required arguments:\n - --config-dir=DIR\tDirecory containing the necessary config files\n - --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n - --work-dir=DIR\t\tWorking directory\n -"; - -if [ $# -lt 3 ]; then - error_exit $usage; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --config-dir=*) - CONFDIR=`read_dirname $1`; shift ;; - --corpus-dir=*) - CORPUS=`read_dirname $1`; shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - esac -done - -# (1) check if the config files are in place: -cd $CONFDIR -[ -f test_spk.list ] || error_exit "$PROG: Eval-set speaker list not found."; - -cd $WDIR -[ -f path.sh ] && . path.sh # Sets the PATH to contain necessary executables - -# (2) get the various file lists (for audio, transcription, etc.) -mkdir -p data/local -timit_prep_flists.sh --corpus-dir=$CORPUS --dev-spk=$CONFDIR/dev_spk.list \ - --test-spk=$CONFDIR/test_spk.list --work-dir=data - -# (3) Normalize the transcripts. -timit_norm_trans.pl -i data/local/train.trans -m $CONFDIR/phones.60-48-39.map \ - -to 48 > data/local/train.trans2; -for x in dev test; do - timit_norm_trans.pl -i data/local/${x}.trans -m $CONFDIR/phones.60-48-39.map \ - -to 39 > data/local/${x}.trans2; -done - -# Create the lexicon, which is just an identity mapping -cut -d' ' -f2- data/local/train.trans2 | tr ' ' '\n' | sort -u > data/local/p -paste data/local/p data/local/p > data/local/lexicon.txt - -# add disambig symbols to the lexicon: TODO: delete -ndisambig=`add_lex_disambig.pl data/local/lexicon.txt data/local/lexicon_disambig.txt` -ndisambig=$[$ndisambig+1]; # add one disambig symbol for silence -echo $ndisambig > data/local/lex_ndisambig - -# Get the list of phones and map them to integers (adding the null symbol -# to the list). -cut -f2 data/local/lexicon.txt \ - | awk 'BEGIN{ print " 0"; } { printf("%s %d\n", $1, NR); }' \ - > data/local/phones.txt - -# Get the list of words: -cut -f1 data/local/lexicon.txt \ - | awk 'BEGIN{print " 0";} {printf("%s %d\n", $1, NR);} - END{printf("#0 %d\n", NR+1);}' > data/local/words.txt - -# (4) Create the phone bigram LM -( -if [ -z $IRSTLM ] ; then - export IRSTLM=$KALDI_ROOT/tools/irstlm/ -fi -export PATH=${PATH}:$IRSTLM/bin -if ! command -v prune-lm >/dev/null 2>&1 ; then - echo "$0: Error: the IRSTLM is not available or compiled" >&2 - echo "$0: Error: We used to install it by default, but." >&2 - echo "$0: Error: this is no longer the case." >&2 - echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 - echo "$0: Error: and run extras/install_irstlm.sh" >&2 - exit 1 -fi - -cut -d' ' -f2- $srcdir/text | sed -e 's:^: :' -e 's:$: :' \ - > $srcdir/lm_train - -cut -d' ' -f2- data/local/train.trans2 | sed -e 's:^: :' -e 's:$: :' \ - > data/local/lm_train.txt - -build-lm.sh -i data/local/lm_train.txt -n 2 \ - -o data/local/lm_phone_bg.ilm.gz - -compile-lm data/local/lm_phone_bg.ilm.gz --text yes /dev/stdout \ - | grep -v unk | gzip -c > data/local/lm_phone_bg.arpa.gz - -) >& data/prepare_lm.log - -echo "Finished data preparation." 
diff --git a/egs/timit/s4/local/timit_format_data.sh b/egs/timit/s4/local/timit_format_data.sh deleted file mode 100755 index 5b8fa1c5169..00000000000 --- a/egs/timit/s4/local/timit_format_data.sh +++ /dev/null @@ -1,137 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o pipefail - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists.\n\n -Required arguments:\n - --hmm-proto=FILE\tPrototype of the HMM topology\n - --work-dir=DIR\t\tWorking directory\n -"; - -if [ $# -lt 2 ]; then - error_exit $usage; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --hmm-proto=*) - PROTO=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -f $PROTO ] || error_exit "Cannot find HMM prototype file '$PROTO'"; - shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; error_exit $usage ;; - esac -done - -cd $WDIR -. path.sh - -echo "Preparing train data" - -# (0) Create a directory to contain files needed in training: -for x in train dev test; do - mkdir -p data/$x - cp data/local/${x}_wav.scp data/$x/wav.scp - cp data/local/${x}.trans2 data/$x/text - cp data/local/${x}.spk2utt data/$x/spk2utt - cp data/local/${x}.utt2spk data/$x/utt2spk -done - -mkdir -p data/lang -cp data/local/phones.txt -t data/lang/ -cp data/local/words.txt -t data/lang/ - -# (1) Generate colon-separated lists of silence and non-silence phones -silphones="cl epi sil vcl"; -silphones.pl data/lang/phones.txt "$silphones" \ - data/lang/silphones.csl data/lang/nonsilphones.csl - -# (2) Create the L.fst without disambiguation symbols, for use in training. -make_lexicon_fst.pl data/local/lexicon.txt 0.5 sil \ - | fstcompile --isymbols=data/lang/phones.txt \ - --osymbols=data/lang/words.txt --keep_isymbols=false \ - --keep_osymbols=false \ - | fstarcsort --sort_type=olabel > data/lang/L.fst - -# (3) Create phonesets.txt and extra_questions.txt. 
-timit_make_questions.pl -i data/lang/phones.txt \ - -m data/lang/phonesets_mono.txt -r data/lang/roots.txt -grep -v sil data/lang/phonesets_mono.txt \ - > data/lang/phonesets_cluster.txt -echo "cl epi sil vcl" > data/lang/extra_questions.txt - -# (4), Finally, for training, create the HMM topology prototype: -silphonelist=`cat data/lang/silphones.csl | sed 's/:/ /g'` -nonsilphonelist=`cat data/lang/nonsilphones.csl | sed 's/:/ /g'` -sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \ - -e "s:SILENCEPHONES:$silphonelist:" $PROTO > data/lang/topo - -echo "Preparing test data" - -# (0) Copy over some files common to traina and test: -mkdir -p data/lang_test -for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do - cp data/lang/$f -t data/lang_test/ -done - -# (1) Create a list of phones including the disambiguation symbols. -# --include-zero includes the #0 symbol that is passed from G.fst -ndisambig=`cat data/local/lex_ndisambig`; -add_disambig.pl --include-zero data/lang_test/phones.txt $ndisambig \ - > data/lang_test/phones_disambig.txt -cp data/lang_test/phones_disambig.txt -t data/lang/ # for MMI. - -# (2) Create the lexicon FST with disambiguation symbols. There is an extra -# step where we create a loop to "pass through" the disambiguation symbols -# from G.fst. -phone_disambig_symbol=`grep \#0 data/lang_test/phones_disambig.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 data/lang_test/words.txt | awk '{print $2}'` - -make_lexicon_fst.pl data/local/lexicon_disambig.txt 0.5 sil '#'$ndisambig \ - | fstcompile --isymbols=data/lang_test/phones_disambig.txt \ - --osymbols=data/lang_test/words.txt --keep_isymbols=false \ - --keep_osymbols=false \ - | fstaddselfloops "echo $phone_disambig_symbol |" \ - "echo $word_disambig_symbol |" \ - | fstarcsort --sort_type=olabel > data/lang_test/L_disambig.fst - - # Needed for discriminative training -cp data/lang_test/L_disambig.fst -t data/lang/ - -# (3) Convert the language model to FST, and create decoding configuration. -timit_format_lms.sh data - -echo "Succeeded in formatting data." diff --git a/egs/timit/s4/local/timit_format_lms.sh b/egs/timit/s4/local/timit_format_lms.sh deleted file mode 100755 index c122515ff2c..00000000000 --- a/egs/timit/s4/local/timit_format_lms.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit -#set -o pipefail - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function read_dirname () { - [ -d "$1" ] || error_exit "Argument '$1' not a directory"; - local retval=`cd $1 2>/dev/null && pwd || exit 1` - echo $retval -} - -function format_lms () { - local lm_suffix=$1; - local work_dir=$2 - local test=$work_dir/lang_test_${lm_suffix} - - mkdir -p $test - for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \ - silphones.csl nonsilphones.csl; do - cp $work_dir/lang_test/$f $test - done - - # Removing all "illegal" combinations of and , which are supposed to - # occur only at being/end of utt. These can cause determinization failures - # of CLG [ends up being epsilon cycles]. - gunzip -c $work_dir/local/lm_${lm_suffix}.arpa.gz \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | eps2disambig.pl | s2eps.pl \ - | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon > $test/G.fst - set +e - fstisstochastic $test/G.fst - set -e -} - -PROG=`basename $0`; -usage="Usage: $PROG data_dir\n - Convert ARPA-format language models to FSTs.\n"; - -if [ $# -ne 1 ]; then - error_exit $usage; -fi -WDIR=`read_dirname $1`; - -# Next, for each type of language model, create the corresponding FST -# and the corresponding lang_test directory. - -echo "Preparing language models for test" -format_lms phone_bg $WDIR >& $WDIR/format_lms.log diff --git a/egs/timit/s4/local/timit_make_questions.pl b/egs/timit/s4/local/timit_make_questions.pl deleted file mode 100755 index a8b1355a63a..00000000000 --- a/egs/timit/s4/local/timit_make_questions.pl +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# 'phonesets_mono' contains sets of phones that are shared when building the -# monophone system and when asking questions based on an automatic clustering -# of phones, for the triphone system. -# 'roots' contain the information about which phones share a common root in -# the phonetic decision tree and which have distinct pdfs. It also states -# whether the tree-building should split the roots or not. 
- -my $usage = "Usage: timit_make_questions.pl -i phones -m phoneset_mono -r roots\ -Creates sharerd phonesets for monophone and context-dependent training.\ -Required arguments:\ - -i\tInput list of phones (can contain stress/position markers)\ - -m\tOutput shared phoneset for use in monophone training\ - -r\tOutput sharing and splitting info for context-dependent training\n"; - -use strict; -use Getopt::Long; -my ($in_phones, $mono, $roots, %phoneset); -GetOptions ("i=s" => \$in_phones, # Input list of phones - "m=s" => \$mono, # Shared phone-set for monophone system - "r=s" => \$roots ); # roots file for context-dependent systems - -die "$usage" unless(defined($in_phones) && defined($mono) && defined($roots)); - -open(P, "<$in_phones") or die "Cannot read from file '$in_phones': $!"; -open(MONO, ">$mono") or die "Cannot write to file '$mono': $!"; -open(ROOTS, ">$roots") or die "Cannot write to file '$roots': $!"; - -while (
<P>
) { - next if m/eps|sil|vcl|cl|epi/; - chomp; - m/^(\S+)(_.)?\s+\S+$/ or die "Bad line: $_\n"; - my $full_phone = defined($2)? $1.$2 : $1; - push @{$phoneset{$1}}, $full_phone; -} - -print MONO "cl epi sil vcl\n"; -print ROOTS "not-shared not-split cl epi sil vcl\n"; -foreach my $p (sort keys %phoneset) { - print MONO join(" ", @{$phoneset{$p}}), "\n"; - print ROOTS "shared split ", join(" ", @{$phoneset{$p}}), "\n"; -} diff --git a/egs/timit/s4/local/timit_norm_trans.pl b/egs/timit/s4/local/timit_norm_trans.pl deleted file mode 100755 index 07a185048d3..00000000000 --- a/egs/timit/s4/local/timit_norm_trans.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script normalizes the TIMIT phonetic transcripts that have been -# extracted in a format where each line contains an utterance ID followed by -# the transcript, e.g.: -# fcke0_si1111 h# hh ah dx ux w iy dcl d ix f ay n ih q h# - -my $usage = "Usage: timit_norm_trans.pl -i transcript -m phone_map -from [60|48] -to [48|39] > normalized\n -Normalizes phonetic transcriptions for TIMIT, by mapping the phones to a -smaller set defined by the -m option. This script assumes that the mapping is -done in the \"standard\" fashion, i.e. to 48 or 39 phones. The input is -assumed to have 60 phones (+1 for glottal stop, which is deleted), but that can -be changed using the -from option. The input format is assumed to be utterance -ID followed by transcript on the same line.\n"; - -use strict; -use Getopt::Long; -die "$usage" unless(@ARGV >= 1); -my ($in_trans, $phone_map, $num_phones_out); -my $num_phones_in = 60; -GetOptions ("i=s" => \$in_trans, # Input transcription - "m=s" => \$phone_map, # File containing phone mappings - "from=i" => \$num_phones_in, # Input #phones: must be 60 or 48 - "to=i" => \$num_phones_out ); # Output #phones: must be 48 or 39 - -die $usage unless(defined($in_trans) && defined($phone_map) && - defined($num_phones_out)); -if ($num_phones_in != 60 && $num_phones_in != 48) { - die "Can only used 60 or 48 for -from (used $num_phones_in)." -} -if ($num_phones_out != 48 && $num_phones_out != 39) { - die "Can only used 48 or 39 for -to (used $num_phones_out)." -} -unless ($num_phones_out < $num_phones_in) { - die "Argument to -from ($num_phones_in) must be greater than that to -to ($num_phones_out)." -} - - -open(M, "<$phone_map") or die "Cannot open mappings file '$phone_map': $!"; -my (%phonemap, %seen_phones); -my $num_seen_phones = 0; -while () { - chomp; - next if ($_ =~ /^q\s*.*$/); # Ignore glottal stops. - m:^(\S+)\s+(\S+)\s+(\S+)$: or die "Bad line: $_"; - my $mapped_from = ($num_phones_in == 60)? $1 : $2; - my $mapped_to = ($num_phones_out == 48)? 
$2 : $3; - if (!defined($seen_phones{$mapped_to})) { - $seen_phones{$mapped_to} = 1; - $num_seen_phones += 1; - } - $phonemap{$mapped_from} = $mapped_to; -} -if ($num_seen_phones != $num_phones_out) { - die "Trying to map to $num_phones_out phones, but seen only $num_seen_phones"; -} - -open(T, "<$in_trans") or die "Cannot open transcription file '$in_trans': $!"; -while () { - chomp; - $_ =~ m:^(\S+)\s+(.+): or die "Bad line: $_"; - my $utt_id = $1; - my $trans = $2; - - $trans =~ s/q//g; # Remove glottal stops. - $trans =~ s/^\s*//; $trans =~ s/\s*$//; # Normalize spaces - - print $utt_id; - for my $phone (split(/\s+/, $trans)) { - print " $phonemap{$phone}" - } - print "\n"; -} diff --git a/egs/timit/s4/local/timit_prep_flists.sh b/egs/timit/s4/local/timit_prep_flists.sh deleted file mode 100755 index c7f969f6b6e..00000000000 --- a/egs/timit/s4/local/timit_prep_flists.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit -set -o pipefail - -function read_dirname () { - local dir_name=`expr "X$1" : '[^=]*=\(.*\)'`; - [ -d "$dir_name" ] || { echo "Argument '$dir_name' not a directory" >&2; \ - exit 1; } - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG \n -Prepare train, dev, test file lists for TIMIT.\n\n -Required arguments:\n - --corpus-dir=DIR\tDirectory for the TIMIT corpus\n - --dev-spk=FILE\tDevelopment set speaker list\n - --test-spk=FILE\tCore test set speaker list\n - --work-dir=DIR\t\tPlace to write the files (in a subdirectory with the 2-letter language code)\n -"; - -if [ $# -lt 3 ]; then - echo -e $usage; exit 1; -fi - -while [ $# -gt 0 ]; -do - case "$1" in - --help) echo -e $usage; exit 0 ;; - --corpus-dir=*) - CORPUS=`read_dirname $1`; shift ;; - --dev-spk=*) - DEVSPK=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --test-spk=*) - TESTSPK=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --work-dir=*) - WDIR=`read_dirname $1`; shift ;; - *) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - esac -done - -if [ ! -d "$CORPUS/train" -a ! -d "$CORPUS/TRAIN" ]; then - echo "Expecting directory $CORPUS/train or $CORPUS/TRAIN to exist." - exit 1; -fi - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT - -# Get the list of speakers. The list of speakers in the 24-speaker core test -# set and the 50-speaker development set must be supplied to the script. All -# speakers in the 'train' directory are used for training. -tr '[:upper:]' '[:lower:]' < $DEVSPK > $tmpdir/dev_spk # Just in case! -tr '[:upper:]' '[:lower:]' < $TESTSPK > $tmpdir/test_spk # Just in case! - -ls -d "$CORPUS"/train/dr*/* | sed -e "s:^.*/::" > $tmpdir/train_spk - - -ODIR=$WDIR/local # Directory to write file lists & transcripts -mkdir -p $ODIR - -for x in train dev test; do - # First, find the list of audio files (use only si & sx utterances). 
- # Note: train & test sets are under different directories, but doing find on - # both and grepping for the speakers will work correctly. - find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.wav' \ - | grep -f $tmpdir/${x}_spk > $ODIR/${x}_sph.flist - sed -e 's:.*/\(.*\)/\(.*\).wav$:\1_\2:' $ODIR/${x}_sph.flist \ - > $tmpdir/${x}_sph.uttids - paste $tmpdir/${x}_sph.uttids $ODIR/${x}_sph.flist \ - | sort -k1,1 > $ODIR/${x}_sph.scp - - # Now, get the transcripts: each line of the output contains an utterance - # ID followed by the transcript. - find $CORPUS/{train,test} -not \( -name 'sa*' \) -name '*.phn' \ - | grep -f $tmpdir/${x}_spk > $tmpdir/${x}_phn.flist - sed -e 's:.*/\(.*\)/\(.*\).phn$:\1_\2:' $tmpdir/${x}_phn.flist \ - > $tmpdir/${x}_phn.uttids - while read line; do - [ -f $line ] || error_exit "Cannot find transcription file '$line'"; - cut -f3 -d' ' "$line" | tr '\n' ' ' | sed -e 's: *$:\n:' - done < $tmpdir/${x}_phn.flist > $tmpdir/${x}_phn.trans - paste $tmpdir/${x}_phn.uttids $tmpdir/${x}_phn.trans \ - | sort -k1,1 > $ODIR/${x}.trans - - # # Intersect the set of utterances with transcripts with the set of those - # # with valid audio. - # cut -f1 $tmpdir/${x}.trans \ - # | join $tmpdir/${x}_basenames_wav2 - > $tmpdir/${x}_basenames - # # Get the common set of WAV files and transcripts. - # join $tmpdir/${x}_basenames $tmpdir/${x}_wav.scp \ - # > $ODIR/${x}_wav.scp - # join $tmpdir/${x}_basenames $tmpdir/${x}.trans \ - # > $ODIR/${x}.trans - - awk '{printf("%s sph2pipe -f wav %s |\n", $1, $2);}' < $ODIR/${x}_sph.scp \ - > $ODIR/${x}_wav.scp - - sed -e 's:_.*$::' $tmpdir/${x}_sph.uttids \ - | paste -d' ' $tmpdir/${x}_sph.uttids - | sort -k1,1 \ - > $ODIR/${x}.utt2spk - utt2spk_to_spk2utt.pl $ODIR/${x}.utt2spk \ - > $ODIR/${x}.spk2utt; -done diff --git a/egs/timit/s4/path.sh b/egs/timit/s4/path.sh deleted file mode 100644 index 0167f6d038b..00000000000 --- a/egs/timit/s4/path.sh +++ /dev/null @@ -1,35 +0,0 @@ -# This contains the locations of the tools and data required for running -# the TIMIT experiments. - -# The KALDIROOT enviromnent variable must be set by the user. -# KALDIROOT=/absolute/path/to/kaldi/installation -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -KALDISRC=$KALDIROOT/src -KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin -KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin -KALDIBIN=$KALDIBIN:$KALDISRC/sgmmbin:$KALDISRC/lm - -FSTBIN=$KALDIROOT/tools/openfst/bin -LMBIN=$KALDIROOT/tools/irstlm/bin - -[ -d $PWD/local ] || { echo "Expecting 'local' subdirectory"; exit 1; } -[ -d $PWD/utils ] || { echo "Expecting 'utils' subdirectory"; exit 1; } -[ -d $PWD/steps ] || { echo "Expecting 'steps' subdirectory"; exit 1; } - -LOCALUTILS=$PWD/local -KALDIUTILS=$PWD/utils -KALDISTEPS=$PWD/steps -SCRIPTS=$LOCALUTILS:$KALDIUTILS:$KALDISTEPS - -# If you already have shorten and sox on your path, comment the following out. -# Else use install.sh to install them first in the specified locations. -SPH2PIPE=$KALDIROOT/tools/sph2pipe_v2.5 -[ -x $SPH2PIPE/sph2pipe ] || { echo "Cannot find sph2pipe executable"; } -TOOLS=$SPH2PIPE - -export PATH=$PATH:$KALDIBIN:$FSTBIN:$LMBIN:$SCRIPTS:$TOOLS -export LC_ALL=C - -## Site-specific configs for Edinburgh -# [ `hostname -y` == ecdf ] && \ -# { . 
/etc/profile.d/modules.sh; module add intel/mkl; } diff --git a/egs/timit/s4/run.sh b/egs/timit/s4/run.sh deleted file mode 100755 index 7b6f25eedaa..00000000000 --- a/egs/timit/s4/run.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -exit 1; -# This script shows the steps needed to build a phone recognizer for TIMIT. - -# This recipe follows the setup first described in: -# K. F. Lee and H. W. Hon, "Speaker-independent phone recognition using hidden Markov models," 1988 -# where the training set is mapped to 48 phones and the results are presented -# on a 39-phone subset of that. - -# Set WORKDIR to someplace with enough disk space. That is where MFCCs will -# get created, as well as the LM in ARPA & FST formats. -WORKDIR=/path/with/disk/space -mkdir -p $WORKDIR -cp -r conf local utils steps path.sh $WORKDIR -cd $WORKDIR -. path.sh -[ -z "$KALDIROOT" ] && echo "ERROR: Must specify the KALDIROOT env varaible" && exit 1; - -local/timit_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/TIMIT --work-dir=$WORKDIR - -local/timit_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD - -# Now make MFCC features. -mfccdir=$WORKDIR/data/MFCC -for x in train dev test; do - steps/make_mfcc.sh --num-jobs 6 data/$x exp/make_mfcc/$x $mfccdir -done - -decode_cmd="qsub -q all.q@@blade -l ram_free=500M,mem_free=500M" -train_cmd="qsub -q all.q@@blade -l ram_free=200M,mem_free=200M" - -steps/train_mono.sh --num-jobs 10 --qcmd "$train_cmd" \ - data/train data/lang exp/mono -utils/mkgraph.sh --mono data/lang_test_phone_bg exp/mono exp/mono/graph_bg -steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \ - --qcmd "$decode_cmd" exp/mono/graph_bg data/dev exp/mono/decode_dev_bg -utils/score_lats.sh exp/mono/decode_dev_bg exp/mono/graph_bg/words.txt \ - data/dev conf/phones.60-48-39.map -opt_accwt=`grep WER exp/mono/decode_dev_bg/wer_* \ - | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' | sort -k2,2 -g \ - | head -1 | awk '{print 1/$1}'` -steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \ - --qcmd "$decode_cmd" exp/mono/graph_bg data/test exp/mono/decode_test_bg -utils/score_text.sh exp/mono/decode_test_bg exp/mono/graph_bg/words.txt \ - data/test conf/phones.60-48-39.map - -steps/align_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali - -steps/train_deltas.sh --num-jobs 10 --qcmd "$train_cmd" \ - 2000 10000 data/train data/lang exp/mono_ali exp/tri1 - -utils/mkgraph.sh data/lang_test_phone_bg exp/tri1 exp/tri1/graph_bg -steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \ - --qcmd "$decode_cmd" exp/tri1/graph_bg data/dev exp/tri1/decode_dev_bg -utils/score_lats.sh exp/tri1/decode_dev_bg exp/tri1/graph_bg/words.txt \ - data/dev conf/phones.60-48-39.map -opt_accwt=`grep WER exp/tri1/decode_dev_bg/wer_* \ - | sed -e 's?.*wer_??' -e 's?:%WER??' -e 's?\[.*??' 
| sort -k2,2 -g \ - | head -1 | awk '{print 1/$1}'` -steps/decode_deltas.sh --accwt $opt_accwt --beam 20.0 --num-jobs 4 \ - --qcmd "$decode_cmd" exp/tri1/graph_bg data/test exp/tri1/decode_test_bg -utils/score_text.sh exp/tri1/decode_test_bg exp/tri1/graph_bg/words.txt \ - data/test conf/phones.60-48-39.map - diff --git a/egs/timit/s4/steps/align_deltas.sh b/egs/timit/s4/steps/align_deltas.sh deleted file mode 100755 index 89cba6192ae..00000000000 --- a/egs/timit/s4/steps/align_deltas.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2012 Microsoft Corporation; Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. - -# This script does training-data alignment given a model built using -# CMN + delta + delta-delta features. It splits the data into -# four chunks and does everything in parallel on the same machine. -# Its output, all in its own experimental directory, is (assuming -# you don't change the #jobs with --num-job option), -# {0,1,2,3}.cmvn {0,1,2,3}.ali.gz, tree, final.mdl -# and final.occs (the last three are just copied from the source directory). - - -# Option to use precompiled graphs from last phase, if these -# are available (i.e. if they were built with the same data). -# These must be split into four pieces. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -qcmd="" # Options for the submit_jobs.sh script -oldgraphs=false - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG data/train data/lang exp/tri1 exp/tri1_ali\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --use-graphs\tReuse older graphs\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --use-graphs) - oldgraphs=true; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 4 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -data=$1 -lang=$2 -srcdir=$3 -dir=$4 - -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir -# Create copy of the tree and model and occs... 
-cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1; - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" - -if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -echo "Computing cepstral mean and variance statistics" -# for n in `get_splits.pl $njobs`; do # Do this locally; it's fast. -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/cmvnTASK_ID.log \ - compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \ - scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \ - || error_exit "Computing CMN/CVN stats failed."; - - -# Align all training data using the supplied model. -echo "Aligning data from $data" -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - -if $oldgraphs; then - # for n in `get_splits.pl $njobs`; do - # feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - ls $srcdir/{1..$njobs}.fsts.gz >/dev/null \ - || error_exit "Missing FSTs with --use-graphs option specified." - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \ - "ark:gunzip -c $srcdir/TASK_ID.fsts.gz|" "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error doing alignment."; - -else - # for n in `get_splits.pl $njobs`; do - # feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - # compute integer form of transcripts. - tra="ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt $data/split$njobs/TASK_ID/text|"; - # We could just use gmm-align in the next line, but it's less efficient as - # it compiles the training graphs one by one. - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/alignTASK_ID.log \ - compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/final.mdl \ - ark:- "$feats" "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error doing alignment."; -fi - -echo "Done aligning data." diff --git a/egs/timit/s4/steps/decode_deltas.sh b/egs/timit/s4/steps/decode_deltas.sh deleted file mode 100755 index 5d5594fd981..00000000000 --- a/egs/timit/s4/steps/decode_deltas.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Decoding script that works with a GMM model and delta-delta plus -# cepstral mean subtraction features. Used, for example, to decode -# mono/ and tri1/ -# This script just generates lattices for a single broken-up -# piece of the data. 
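For orientation, the recipe's top-level run.sh (removed above) drives this decoding script roughly as follows; this is only a usage sketch, with the queue options and paths exactly as they appeared in that run.sh:

  decode_cmd="qsub -q all.q@@blade -l ram_free=500M,mem_free=500M"
  steps/decode_deltas.sh --accwt 1.0 --beam 20.0 --latgen --num-jobs 6 \
    --qcmd "$decode_cmd" exp/mono/graph_bg data/dev exp/mono/decode_dev_bg
  utils/score_lats.sh exp/mono/decode_dev_bg exp/mono/graph_bg/words.txt \
    data/dev conf/phones.60-48-39.map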
- -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readfloat () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - [[ "$retval" =~ ^-?[0-9]*\.*[0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not a real number." - echo $retval -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -accwt=1.0 -beam=30.0 -latgen=0 -njobs=4 -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG exp/mono/graph_bg data/dev exp/mono/decode_dev_bg\n\n -Options:\n - --help\t\tPrint this message and exit\n - --accwt FLOAT\tScaling for acoustic likelihoods (default=$accwt).\n - --beam FLOAT\tDecoder beam (default=$beam)\n - --latgen\tGenerate lattices (off by default)\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --accwt) - shift; accwt=`readfloat $1`; shift ;; - --beam) - shift; beam=`readfloat $1`; shift ;; - --latgen) shift; latgen=1 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - -*) error_exit "Unknown argument: $1, exiting\n$usage" ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -graphdir=$1 -data=$2 -dir=$3 -srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. - -mkdir -p $dir - -requirements="$data/feats.scp $srcdir/final.mdl $graphdir/HCLG.fst" -for f in $requirements; do - if [ ! -f $f ]; then - echo "decode_deltas.sh: no such file $f"; - exit 1; - fi -done - -# We only do one decoding pass, so there is no point caching the -# CMVN stats-- we make them part of a pipe. -feats="ark:compute-cmvn-stats --spk2utt=ark:$data/spk2utt scp:$data/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk ark:- scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" -if [ $njobs -gt 1 ]; then - if [ ! 
-d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs - fi - mydata=$data/split$njobs/TASK_ID - feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | add-deltas ark:- ark:- |" -fi - -if [ $latgen -eq 1 ]; then - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \ - gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \ - --acoustic-scale=$accwt --word-symbol-table=$graphdir/words.txt \ - $srcdir/final.mdl $graphdir/HCLG.fst "$feats" \ - "ark:|gzip -c > $dir/lat.TASK_ID.gz" || error_exit "Decoding failed."; -else - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/decode.TASK_ID.log \ - gmm-decode-faster --beam=$beam --acoustic-scale=$accwt \ - --word-symbol-table=$graphdir/words.txt $srcdir/final.mdl \ - $graphdir/HCLG.fst "$feats" ark,t:$dir/test.TASK_ID.tra \ - || error_exit "Decoding failed."; -fi diff --git a/egs/timit/s4/steps/make_mfcc.sh b/egs/timit/s4/steps/make_mfcc.sh deleted file mode 100755 index 7033f0a1a42..00000000000 --- a/egs/timit/s4/steps/make_mfcc.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. (one directory up from here) - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with calculating CMN/CVN stats) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -data=$1 -logdir=$2 -mfccdir=$3 - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $mfccdir || exit 1; -mkdir -p $logdir || exit 1; - -scp=$data/wav.scp -config=conf/mfcc.conf -required="$scp $config" - -for f in $required; do - if [ ! 
-f $f ]; then - echo "make_mfcc.sh: no such file $f" - exit 1; - fi -done - -# note: in general, the double-parenthesis construct in bash "((" is "C-style -# syntax" where we can get rid of the $ for variable names, and omit spaces. -# The "for" loop in this style is a special construct. - -split_scps="" -for ((n=1; n<=njobs; n++)); do - split_scps="$split_scps $logdir/wav$n.scp" -done - -split_scp.pl $scp $split_scps || exit 1; - -rm -f $logdir/.error.$name 2>/dev/null -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$logdir/make_mfcc.TASK_ID.log \ - compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wavTASK_ID.scp \ - ark,scp:$mfccdir/mfcc_$name.TASK_ID.ark,$mfccdir/mfcc_$name.TASK_ID.scp \ - || error_exit "Error producing mfcc features for $name:"`tail $logdir/make_mfcc.*.log` - -# concatenate the .scp files together. -rm $data/feats.scp 2>/dev/null -for ((n=1; n<=njobs; n++)); do - cat $mfccdir/mfcc_$name.$n.scp >> $data/feats.scp -done - -# rm $logdir/wav*.scp - -echo "Succeeded creating MFCC features for $name" diff --git a/egs/timit/s4/steps/train_deltas.sh b/egs/timit/s4/steps/train_deltas.sh deleted file mode 100755 index 9101d89c9f4..00000000000 --- a/egs/timit/s4/steps/train_deltas.sh +++ /dev/null @@ -1,256 +0,0 @@ -#!/bin/bash - -# Copyright 2010-2012 Microsoft Corporation; Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Triphone model training, using (e.g. MFCC) + delta + acceleration features and -# cepstral mean normalization. It starts from an existing directory (e.g. -# exp/mono), supplied as an argument, which is assumed to be built using the same -# type of features. -# -# This script starts from previously generated state-level alignments -# (in $alidir), e.g. generated by a previous monophone or triphone -# system. To build a context-dependent triphone system, we build -# decision trees that map a 3-phone phonetic context window to a -# pdf index. It's not really clear which is the right reference, but -# on is "Tree-based state tying for high accuracy acoustic modelling" -# by Steve Young et al. -# In a typical approach, there are decision trees for -# each monophone HMM-state (i.e. 3 per phone), and each one gets to -# ask questions about the left and right phone. These questions -# correspond to sets of phones, corresponding to phonetic classes -# (e.g. vowel, consonant, liquid, solar, ... ). In Kaldi, we prefer -# fully automatic algorithms, and anyway we're not sure where to get -# these types of lists, so we just generate the classes automatically. -# This is based on a top-down binary tree clustering of the phones -# (see "cluster-phones"), where we take single-Gaussian statistics for -# just the central state of each phone (assuming this to be more -# representative of the phones), and we get a tree structure on the -# phones; each class corresponds to a node of the tree (it contains all -# the phones that are children of that node). 
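The clustering described above is carried out further down in this same script; stripped of the submit_jobs.sh wrapper and logging, and with TASK_ID replaced by 1, the single-job sequence looks roughly like this (a sketch assembled from the stage -3 and stage -2 blocks below, not an independent implementation):

  # accumulate single-Gaussian stats for each seen (phone-in-context, HMM-state)
  acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$featspart" \
    "ark:gunzip -c $alidir/1.ali.gz|" $dir/1.treeacc
  sum-tree-stats $dir/treeacc $dir/*.treeacc
  # cluster the phones top-down to get the automatically generated questions
  cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt
  compile-questions $lang/topo $dir/questions.txt $dir/questions.qst
  # grow the decision tree and initialize a model from its statistics
  build-tree --verbose=1 --max-leaves=$numleaves $dir/treeacc $dir/roots.txt \
    $dir/questions.qst $lang/topo $dir/tree
  gmm-init-model --write-occs=$dir/1.occs $dir/tree $dir/treeacc $lang/topo $dir/1.mdl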
Note: you could -# replace questions.txt with something derived from manually written -# questions. -# Also, the roots of the tree correspond to classes of phones (typically -# corresponding to "real phones", because the actual phones may contain -# word-begin/end and stress information), and the tree gets to ask -# questions also about the central phone, and about the state in the HMM. -# After building the tree, we do a number of iterations of Gaussian -# Mixture Model training; on selected iterations we redo the Viterbi -# alignments (initially, these are taken from the previous system). -# The Gaussian mixture splitting, whereby we go from a single Gaussian -# per state to multiple Gaussians, is done on all iterations (although -# we stop doing this a few iterations before the end). We don't have -# a fixed number of Gaussians per state, but we have an overall target -# #Gaussians that's specified on each iteration, and we allocate -# the Gaussians among states according to a power-law where the #Gaussians -# is proportional to the count to the power 0.2. The target -# increases linearly during training [note: logarithmically seems more -# natural but didn't work as well.] - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with tree building) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG 2000 10000 data/train_si84 data/lang exp/mono_ali exp/tri1\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --stage INT\tStarting stage (e.g. -4 for tree building; 2 for iter 2; default=$stage)\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd=" --qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as num-leaves - esac -done - -if [ $# != 6 ]; then - error_exit $usage; -fi - -[ -f path.sh ] && . path.sh - -numleaves=$1 -totgauss=$2 -data=$3 -lang=$4 -alidir=$5 -dir=$6 - -if [ ! -f $alidir/final.mdl ]; then - echo "Error: alignment dir $alidir does not contain final.mdl" - exit 1; -fi - -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -realign_iters="10 20 30"; -silphonelist=`cat $lang/silphones.csl` -numiters=35 # Number of iterations of training -maxiterinc=25 # Last iter to increase #Gauss on. -numgauss=$numleaves -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss - -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir/log -if [ ! 
-d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -# for n in `get_splits.pl $njobs`; do -featspart="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$alidir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - -if [ $stage -le -3 ]; then -# The next stage assumes we won't need the context of silence, which -# assumes something about $lang/roots.txt, but it seems pretty safe. - echo "Accumulating tree stats" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc_tree.TASK_ID.log \ - acc-tree-stats --ci-phones=$silphonelist $alidir/final.mdl "$featspart" \ - "ark:gunzip -c $alidir/TASK_ID.ali.gz|" $dir/TASK_ID.treeacc \ - || error_exit "Error accumulating tree stats"; - - sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log \ - || error_exit "Error summing tree stats."; - rm $dir/*.treeacc -fi - -if [ $stage -le -2 ]; then -# preparing questions, roots file... - echo "Computing questions for tree clustering" - ( sym2int.pl $lang/phones.txt $lang/phonesets_cluster.txt > $dir/phonesets.txt - cluster-phones $dir/treeacc $dir/phonesets.txt $dir/questions.txt \ - 2> $dir/log/questions.log - [ -f $lang/extra_questions.txt ] && \ - sym2int.pl $lang/phones.txt $lang/extra_questions.txt \ - >> $dir/questions.txt - compile-questions $lang/topo $dir/questions.txt $dir/questions.qst \ - 2>$dir/log/compile_questions.log - sym2int.pl --ignore-oov $lang/phones.txt $lang/roots.txt > $dir/roots.txt - ) || error_exit "Error in generating questions for tree clustering." - - echo "Building tree" - submit_jobs.sh "$qcmd" --log=$dir/log/train_tree.log \ - build-tree --verbose=1 --max-leaves=$numleaves $dir/treeacc $dir/roots.txt \ - $dir/questions.qst $lang/topo $dir/tree \ - || error_exit "Error in building tree."; - - gmm-init-model --write-occs=$dir/1.occs \ - $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log \ - || error_exit "Error in initializing the model."; - - gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl \ - 2>$dir/log/mixup.log || error_exit "Error mixing up to $numgauss Gaussains"; - - rm $dir/treeacc -fi - - -if [ $stage -le -1 ]; then -# Convert alignments in $alidir, to use as initial alignments. -# This assumes that $alidir was split in $njobs pieces, just like the -# current dir. Just do this locally-- it's very fast. - echo "Converting old alignments" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh --njobs=$njobs --log=$dir/log/convertTASK_ID.log \ - convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \ - "ark:gunzip -c $alidir/TASK_ID.ali.gz|" \ - "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error converting old alignments."; -fi - -if [ $stage -le 0 ]; then -# Make training graphs (this is split in $njobs parts). 
- echo "Compiling training graphs" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \ - compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \ - "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text |" \ - "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \ - || error_exit "Error compiling training graphs"; -fi - -x=1 -while [ $x -lt $numiters ]; do - echo Pass $x - if [ $stage -le $x ]; then - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \ - gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/$x.mdl \ - "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - "ark:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error aligning data on iteration $x"; - fi # Realign iters - - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \ - gmm-acc-stats-ali $dir/$x.mdl "$featspart" \ - "ark,s,cs:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \ - || error_exit "Error accumulating stats on iteration $x"; - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl \ - || error_exit "Error in pass $x extimation."; - rm -f r/$x.mdl $dir/$x.*.acc rm $dir/$x.occs - fi # Completed a training stage. - if [[ $x -le $maxiterinc ]]; then - numgauss=$[$numgauss+$incgauss]; - fi - x=$[$x+1]; -done - -( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \ - ln -s $x.occs final.occs; ) - -# Print out summary of the warning messages. -for x in $dir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo $n warnings in $x; fi; -done - -echo Done diff --git a/egs/timit/s4/steps/train_mono.sh b/egs/timit/s4/steps/train_mono.sh deleted file mode 100755 index b7dad23d7fe..00000000000 --- a/egs/timit/s4/steps/train_mono.sh +++ /dev/null @@ -1,202 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# To be run from .. -# Flat start and monophone training, with delta-delta features. -# This script applies cepstral mean normalization (per speaker). - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." 
- echo $retval -} - -njobs=4 # Default number of jobs -stage=-4 # Default starting stage (start with calculating CMN/CVN stats) -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] \n -e.g.: $PROG data/train.1k data/lang exp/mono\n\n -Options:\n - --help\t\tPrint this message and exit\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n - --stage INT\tStarting stage (e.g. -4 for CMN/CVN stats; 2 for iter 2; default=$stage)\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the data-dir - esac -done - -if [ $# != 3 ]; then - error_exit $usage; -fi - -data=$1 -lang=$2 -dir=$3 - -[ -f path.sh ] && . path.sh - -# Configuration: -scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" -numiters=40 # Number of iterations of training -maxiterinc=30 # Last iter to increase #Gauss on. -numgauss=300 # Initial num-Gauss (must be more than #states=3*phones). -totgauss=1000 # Target #Gaussians. -incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss -realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"; -if [ -f $lang/oov.txt ]; then - oov_opt="--map-oov '"`cat $lang/oov.txt`"'" -else - oov_opt='--ignore-oov' -fi - -mkdir -p $dir/log -if [ ! -d $data/split$njobs -o $data/split$njobs -ot $data/feats.scp ]; then - split_data.sh $data $njobs -fi - -if [ $stage -le -3 ]; then - echo "Computing cepstral mean and variance statistics" - # for n in `get_splits.pl $njobs`; do # do this locally; it's fast. - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/cmvnTASK_ID.log \ - compute-cmvn-stats --spk2utt=ark:$data/split$njobs/TASK_ID/spk2utt \ - scp:$data/split$njobs/TASK_ID/feats.scp ark:$dir/TASK_ID.cmvn \ - || error_exit "Computing CMN/CVN stats failed."; -fi - -feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/utt2spk \"ark:cat $dir/*.cmvn|\" scp:$data/feats.scp ark:- | add-deltas ark:- ark:- |" - -# for n in `get_splits.pl $njobs`; do -# for n in `seq 1 $njobs`; do -featspart="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$njobs/TASK_ID/utt2spk ark:$dir/TASK_ID.cmvn scp:$data/split$njobs/TASK_ID/feats.scp ark:- | add-deltas ark:- ark:- |" - - -if [ $stage -le -2 ]; then - echo "Initializing monophone system." - if [ -f $lang/phonesets_mono.txt ]; then - echo "Using shared phones from $lang/phonesets_mono.txt" - # In recipes with stress and position markers, this pools together - # the stats for the different versions of the same phone (also for - # the various silence phones). 
- sym2int.pl $lang/phones.txt $lang/phonesets_mono.txt > $dir/phonesets.int - shared_phones_opt="--shared-phones=$dir/phonesets.int" - fi - - gmm-init-mono $shared_phones_opt \ - "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo 39 \ - $dir/0.mdl $dir/tree 2> $dir/log/init.log \ - || error_exit "Monophone model initialization failed."; -fi - -if [ $stage -le -1 ]; then - echo "Compiling training graphs" - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/compile_graphsTASK_ID.log \ - compile-train-graphs $dir/tree $dir/0.mdl $lang/L.fst \ - "ark:sym2int.pl $oov_opt --ignore-first-field $lang/words.txt < $data/split$njobs/TASK_ID/text|" \ - "ark:|gzip -c >$dir/TASK_ID.fsts.gz" \ - || error_exit "Error compiling training graphs."; -fi - -if [ $stage -le 0 ]; then - echo "Aligning data equally (pass 0)" -# for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.0.TASK_ID.log \ - align-equal-compiled "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - ark,t,f:- \| \ - gmm-acc-stats-ali --binary=true $dir/0.mdl "$featspart" \ - ark:- $dir/0.TASK_ID.acc \ - || error_exit "Error in pass 0 accumulation"; - -# In the following steps, the --min-gaussian-occupancy=3 option is important, -# otherwise we cannot est "rare" phones and later on, they never align properly. - gmm-est --min-gaussian-occupancy=3 --mix-up=$numgauss \ - $dir/0.mdl "gmm-sum-accs - $dir/0.*.acc|" $dir/1.mdl \ - 2> $dir/log/update.0.log || error_exit "Error in pass 0 estimation."; - - rm $dir/0.*.acc -fi # Finished 0'th training iteration. - -beam=6 # will change to 10 below after 1st pass -x=1 -while [ $x -lt $numiters ]; do - echo "Pass $x" - if [ $stage -le $x ]; then - if echo $realign_iters | grep -w $x >/dev/null; then - echo "Aligning data" - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/align.$x.TASK_ID.log \ - gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$[$beam*4] \ - $dir/$x.mdl "ark:gunzip -c $dir/TASK_ID.fsts.gz|" "$featspart" \ - "ark,t:|gzip -c >$dir/TASK_ID.ali.gz" \ - || error_exit "Error in pass $x alignment."; - fi # Realign iters - - # for n in `get_splits.pl $njobs`; do - submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/log/acc.$x.TASK_ID.log \ - gmm-acc-stats-ali $dir/$x.mdl "$featspart" \ - "ark:gunzip -c $dir/TASK_ID.ali.gz|" $dir/$x.TASK_ID.acc \ - || error_exit "Error in pass $x accumulation."; - - submit_jobs.sh "$qcmd" --log=$dir/log/update.$x.log \ - gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ - "gmm-sum-accs - $dir/$x.*.acc|" $dir/$[$x+1].mdl \ - || error_exit "Error in pass $x extimation."; - rm -f $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs - fi # Completed a training stage. - if [ $x -le $maxiterinc ]; then - numgauss=$[$numgauss+$incgauss]; - fi - beam=10 - x=$[$x+1]; -done - -( cd $dir; rm -f final.{mdl,occs}; ln -s $x.mdl final.mdl; \ - ln -s $x.occs final.occs; ) - -# Print out summary of the warning messages. 
-for x in $dir/log/*.log; do - n=`grep WARNING $x | wc -l`; - if [ $n -ne 0 ]; then echo $n warnings in $x; fi; -done - -echo Done - -# example of showing the alignments: -# show-alignments data/lang/phones.txt $dir/30.mdl "ark:gunzip -c $dir/0.ali.gz|" | head -4 - diff --git a/egs/timit/s4/utils/add_disambig.pl b/egs/timit/s4/utils/add_disambig.pl deleted file mode 100755 index 962ef386763..00000000000 --- a/egs/timit/s4/utils/add_disambig.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds some specified number of disambig symbols to a symbol table. -# Adds these as #1, #2, etc. -# If the --include-zero option is specified, includes an extra one -# #0. - -$include_zero = 0; -if($ARGV[0] eq "--include-zero") { - $include_zero = 1; - shift @ARGV; -} - -if(@ARGV != 2) { - die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt "; -} - - -$input = $ARGV[0]; -$nsyms = $ARGV[1]; - -open(F, "<$input") || die "Opening file $input"; - -while() { - @A = split(" ", $_); - @A == 2 || die "Bad line $_"; - $lastsym = $A[1]; - print; -} - -if(!defined($lastsym)){ - die "Empty symbol file?"; -} - -if($include_zero) { - $lastsym++; - print "#0 $lastsym\n"; -} - -for($n = 1; $n <= $nsyms; $n++) { - $y = $n + $lastsym; - print "#$n $y\n"; -} diff --git a/egs/timit/s4/utils/add_lex_disambig.pl b/egs/timit/s4/utils/add_lex_disambig.pl deleted file mode 100755 index ded04bb4b49..00000000000 --- a/egs/timit/s4/utils/add_lex_disambig.pl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. - -if(@ARGV != 2) { - die "Usage: add_lex_disambig.pl lexicon.txt lexicon_disambig.txt " -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. 
- $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -$max_disambig = 0; -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - $phnseq = join(" ",@A); - if(!defined $issubseq{$phnseq} - && $count{$phnseq}==1) { - ; # Do nothing. - } else { - if($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $curnumber = $disambig_of{$phnseq}; - if(!defined{$curnumber}) { $curnumber = 0; } - $curnumber++; # now 1 or 2, ... - while(defined $reserved{$curnumber} ) { $curnumber++; } # skip over reserved symbols - if($curnumber > $max_disambig) { - $max_disambig = $curnumber; - } - $disambig_of{$phnseq} = $curnumber; - $phnseq = $phnseq . " #" . $curnumber; - } - } - print O "$word\t$phnseq\n"; -} - -print $max_disambig . "\n"; - diff --git a/egs/timit/s4/utils/decode.sh b/egs/timit/s4/utils/decode.sh deleted file mode 100755 index d8706cdca0a..00000000000 --- a/egs/timit/s4/utils/decode.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readint () { - local retval=${1/#*=/}; # In case --switch=ARG format was used - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^-?[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not an integer." - echo $retval -} - -function read_dirname () { - local dir_name=${1/#*=/}; # In case --switch=ARG format was used - [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"; - local retval=`cd $dir_name 2>/dev/null && pwd || exit 1` - echo $retval -} - -orig_args="$*" -njobs="" # Total number of jobs unset by default. Will set to #speakers (if - # using a grid) or 4 (if not), unless specified by user. 
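Referring back to add_lex_disambig.pl above: to make the assignment rule concrete, here is a hypothetical three-entry lexicon (words and phones invented purely for illustration) and the output the script would produce for it:

  input lexicon.txt:        output lexicon_disambig.txt:
  a      ey                 a      ey #1
  ate    ey t               ate    ey t #1
  eight  ey t               eight  ey t #2

"ey" receives #1 because it is a prefix of "ey t"; the two identical "ey t" pronunciations receive #1 and #2; and the script prints 2 (the highest disambiguation index used) on its standard output.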
-lang="" # Option for sclite scoring (off by default) -opts="" -qcmd="" # Options for the submit_jobs.sh script - -PROG=`basename $0`; -usage="Usage: $PROG [options] [extra-args...]\n\n -Options:\n - --help\t\tPrint this message and exit\n - -l DIR\t\tDirectory to find L_align.fst (needed for sclite scoring)\n - --num-jobs INT\tNumber of parallel jobs to run (default=$njobs).\n - --opts STRING\tOptions for the decoder script\n - --qcmd STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -while [ $# -gt 0 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - -l) - shift; lang=`read_dirname $1`; - [ ! -f "$lang/phones_disambig.txt" -o ! -f "$lang/L_align.fst" ] && \ - error_exit "Invalid argument to -l option; expected $lang/phones_disambig.txt and $lang/L_align.fst to exist." - shift ;; - --num-jobs) - shift; njobs=`readint $1`; - [ $njobs -lt 1 ] && error_exit "--num-jobs arg '$njobs' not positive."; - shift ;; - --opts) - shift; opts="$1"; shift ;; - --qcmd) - shift; qcmd="--qcmd=${1}"; shift ;; - --stage) - shift; stage=`readint $1`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - *) break ;; # end of options: interpreted as the script to execute - esac -done - - -if [ $# -lt 4 ]; then - error_exit $usage; -fi - -script=$1 -graphdir=$2 -data=$3 -dir=$4 -# Make "dir" an absolute pathname. -dir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` -mkdir -p $dir || exit 1 -shift;shift;shift;shift; -# Remaining args will be supplied to decoding script. -extra_args=$* - -[ -f path.sh ] && . path.sh - -for file in $script $scp $data/utt2spk; do - if [ ! -f "$file" ]; then - echo "decode.sh: no such file $file" - exit 1 - fi -done - -if [ ! -f $graphdir/HCLG.fst -a ! -f $graphdir/G.fst ]; then - # Note: most scripts expect HCLG.fst in graphdir, but the - # "*_fromlats.sh" script(s) require(s) a "lang" dir in that - # position - echo No such file: $graphdir/HCLG.fst or $graphdir/G.fst - exit 1; -fi - -if [ -z "$njobs" ]; then # Figure out num-jobs; user did not specify. - if [ -z "$qcmd" ]; then - njobs=4 - else # running on queue... - njobs=`utt2spk_to_spk2utt.pl $data/utt2spk | wc -l` - fi -fi - -echo "Decoding with num-jobs = $njobs" -if [[ $njobs -gt 1 || ! -d $data/split$njobs || \ - $data/split$njobs -ot $data/feats.scp ]]; then - split_data.sh $data $njobs -fi - -#for n in `get_splits.pl $njobs`; do -submit_jobs.sh "$qcmd" --njobs=$njobs --log=$dir/partTASK_ID.log \ - $script $opts -j $njobs TASK_ID $graphdir $data $dir $extra_args \ - || error_exit "Error in decoding script: command was decode.sh $orig_args" - -if ls $dir/lat.*.gz >&/dev/null; then - if [ -n "$lang" ]; then - # sclite scoring: $lang directory supplied only for this reason. - [ ! -f $data/stm ] && \ - error_exit "Expected $data/stm to exist (-l only used for sclite scoring)" - score_lats_ctm.sh $dir $lang $data || \ - error_exit "Error in scoring of lattices using sclite." 
- else - score_lats.sh $dir $graphdir/words.txt $data || \ - error_exit "Error in scoring of latices."; - fi -elif ls $dir/*.txt >&/dev/null; then - score_text.sh $dir $data || error_exit "Error in scoring of hypotheses."; -else - eror_exit "No output found in $dir, not scoring."; -fi diff --git a/egs/timit/s4/utils/filter_scp.pl b/egs/timit/s4/utils/filter_scp.pl deleted file mode 100755 index 17483ae8b37..00000000000 --- a/egs/timit/s4/utils/filter_scp.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose first field is an utterance id), printing -# out only those lines whose first field is in id_list. - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl id_list [in.scp] > out.scp "; -} - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - if($seen{$A[0]}) { - print $_; - } -} diff --git a/egs/timit/s4/utils/int2sym.pl b/egs/timit/s4/utils/int2sym.pl deleted file mode 100755 index ad85ef34993..00000000000 --- a/egs/timit/s4/utils/int2sym.pl +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_noninteger = 0; -$ignore_first_field = 0; -$field = -1; -for($x = 0; $x < 2; $x++) { - if($ARGV[0] eq "--ignore-noninteger") { $ignore_noninteger = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--field") { - shift @ARGV; $field = $ARGV[0]+0; shift @ARGV; - if ($field < 1) { die "Bad argument to --field option: $field"; } - } -} - -if ($ignore_first_field && $field > 0) { die "Incompatible options ignore-first-field and field"; } -$zfield = $field-1; # Change to zero-based indexing. 
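As a quick illustration of these options (hypothetical symbol table and transcript, not taken from this recipe): with a words.txt containing entries such as "the 3", "cat 1", "sat 2", the call

  int2sym.pl --ignore-first-field words.txt 1.tra

turns an integer transcript line "utt1 3 1 2" into "utt1 the cat sat", leaving the utterance id untouched; score_lats.sh and score_text.sh below use exactly this --ignore-first-field form.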
- -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input] > output\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $int2sym{$A[1]} = $A[0]; -} - -sub int2sym { - my $a = shift @_; - my $pos = shift @_; - if($a !~ m:^\d+$:) { # not all digits.. - if($ignore_noninteger) { - print $a . " "; - next; - } else { - if($pos == 0) { - die "int2sym.pl: found noninteger token $a (try --ignore-first-field)\n"; - } else { - die "int2sym.pl: found noninteger token $a (try --ignore-noninteger if valid input)\n"; - } - } - } - $s = $int2sym{$a}; - if(!defined ($s)) { - die "int2sym.pl: integer $a not in symbol table $symtab."; - } - return $s; -} - -$error = 0; -while(<>) { - @A = split(" ", $_); - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - if ($field != -1) { - if ($zfield <= $#A && $zfield >= 0) { - $a = $A[$zfield]; - $A[$zfield] = int2sym($a, $zfield); - } - print join(" ", @A); - } else { - for ($pos = 0; $pos <= $#A; $pos++) { - $a = $A[$pos]; - $s = int2sym($a, $pos); - print $s . " "; - } - } - print "\n"; -} - - - diff --git a/egs/timit/s4/utils/make_lexicon_fst.pl b/egs/timit/s4/utils/make_lexicon_fst.pl deleted file mode 100755 index 9e088889cc2..00000000000 --- a/egs/timit/s4/utils/make_lexicon_fst.pl +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST (no pron-probs involved). - -if(@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - die "Usage: make_lexicon_fst.pl lexicon.txt [silprob silphone [sil_disambig_sym]] lexiconfst.txt" -} - -$lexfn = shift @ARGV; -if(@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2){ - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - - -sub is_sil { - # Return true (1) if provided with a phone-sequence - # that means silence. - # @_ is the parameters of the function - # This function returns true if @_ equals ( $silphone ) - # or something of the form ( "#0", $silphone, "#1" ) - # where the "#0" and "#1" are disambiguation symbols. - return ( @_ == 1 && $_[0] eq $silphone || - (@_ == 3 && $_[1] eq $silphone && - $_[0] =~ m/^\#\d+$/ && - $_[0] =~ m/^\#\d+$/)); -} - -if( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nexststate = 1; # next unallocated state. 
- while() { - @A = split(" ", $_); - $w = shift @A; - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while() { - @A = split(" ", $_); - $w = shift @A; - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if(@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps\n"; - $word_or_eps = ""; - $s = $ns; - } else { - if(!is_sil(@A)){ - # This is non-deterministic but relatively compact, - # and avoids epsilons. - print "$s\t$loopstate\t$p\t$word_or_eps\t$nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps\n"; - } - $word_or_eps = ""; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/egs/timit/s4/utils/mkgraph.sh b/egs/timit/s4/utils/mkgraph.sh deleted file mode 100755 index 971de31c782..00000000000 --- a/egs/timit/s4/utils/mkgraph.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script creates a fully expanded decoding graph (HCLG) that represents -# all the language-model, pronunciation dictionary (lexicon), context-dependency, -# and HMM structure in our model. The output is a Finite State Transducer -# that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). -# See -# http://kaldi.sourceforge.net/graph_recipe_test.html -# (this is compiled from this repository using Doxygen, -# the source for this part is in src/doc/graph_recipe_test.dox) - - -N=3 -P=1 -clean=false - -for x in 1 2 3; do - if [ $1 == "--mono" ]; then - N=1; - P=0; - shift; - fi - if [ $1 == "--clean" ]; then - clean=true - shift; - fi - -done - -if [ $# != 3 ]; then - echo "Usage: scripts/mkgraph.sh " - echo "e.g.: scripts/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" - exit 1; -fi - -if [ -f path.sh ]; then . 
path.sh; fi - -lang=$1 -tree=$2/tree -model=$2/final.mdl -dir=$3 - -if $clean; then rm -r $lang/tmp; fi - -mkdir -p $dir - -tscale=1.0 -loopscale=0.1 - -# If $lang/tmp/LG.fst does not exist or is older than its sources, make it... -# (note: the [[ ]] brackets make the || type operators work (inside [ ], we -# would have to use -o instead), -f means file exists, and -ot means older than). - -required="$lang/L.fst $lang/G.fst $lang/phones_disambig.txt $lang/words.txt $lang/silphones.csl $model $tree" -for f in $required; do - [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; -done - -mkdir -p $lang/tmp -if [[ ! -f $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ - $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded > $lang/tmp/LG.fst || exit 1; - fstisstochastic $lang/tmp/LG.fst || echo "warning: LG not stochastic." -fi - -if [ ! -f $lang/phones_disambig.txt ]; then - echo "No such file $lang/phones_disambig.txt (supplied a training lang/ directory?)" - exit 1; -fi - -grep '#' $lang/phones_disambig.txt | awk '{print $2}' > $lang/tmp/disambig_phones.list - - -clg=$lang/tmp/CLG_${N}_${P}.fst - -if [[ ! -f $clg || $clg -ot $lang/tmp/LG.fst ]]; then - fstcomposecontext --context-size=$N --central-position=$P \ - --read-disambig-syms=$lang/tmp/disambig_phones.list \ - --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.list \ - $lang/tmp/ilabels_${N}_${P} < $lang/tmp/LG.fst >$clg - fstisstochastic $clg || echo "warning: CLG not stochastic." -fi - -if [[ ! -f $dir/Ha.fst || $dir/Ha.fst -ot $model \ - || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then - make-h-transducer --disambig-syms-out=$dir/disambig_tid.list \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ - > $dir/Ha.fst || exit 1; -fi - -if [[ ! -f $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ - $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ - | fstrmsymbols $dir/disambig_tid.list | fstrmepslocal | \ - fstminimizeencoded > $dir/HCLGa.fst || exit 1; - fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" -fi - -if [[ ! -f $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; - - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "Final HCLG is not stochastic." - fi -fi - -# keep a copy of the lexicon and a list of silence phones with HCLG... -# this means we can decode without refrence to the $lang directory. -cp $lang/words.txt $dir/ -cp $lang/silphones.csl $dir/ - -# to make const fst: -# fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst - -echo "Finished making decoding graphs in $dir" \ No newline at end of file diff --git a/egs/timit/s4/utils/s2eps.pl b/egs/timit/s4/utils/s2eps.pl deleted file mode 100755 index ffeeb8eb6af..00000000000 --- a/egs/timit/s4/utils/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/egs/timit/s4/utils/score_lats.sh b/egs/timit/s4/utils/score_lats.sh deleted file mode 100755 index e44eafa2ec4..00000000000 --- a/egs/timit/s4/utils/score_lats.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -if [ -f ./path.sh ]; then . ./path.sh; fi - -if [ $# -ne 4 ]; then - echo "Usage: score_lats.sh " - exit 1; -fi - -dir=$1 -symtab=$2 -data=$3 -phonemap=$4 - -if [ ! -f $symtab ]; then - echo No such word symbol table file $symtab - exit 1; -fi -if [ ! -f $data/text ]; then - echo Could not find transcriptions in $data/text - exit 1 -fi - - -trans=$data/text -cp $trans $dir/test.trans - -for inv_acwt in `seq 1 7`; do - acwt=`perl -e "print (1.0/$inv_acwt);"` - lattice-best-path --acoustic-scale=$acwt --word-symbol-table=$symtab \ - "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/${inv_acwt}.tra \ - 2>$dir/rescore_${inv_acwt}.log - - cat $dir/${inv_acwt}.tra \ - | int2sym.pl --ignore-first-field $symtab \ - | timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 | - compute-wer --text --mode=present ark:$dir/test.trans ark,p:- \ - >& $dir/wer_$inv_acwt -done - diff --git a/egs/timit/s4/utils/score_text.sh b/egs/timit/s4/utils/score_text.sh deleted file mode 100755 index 7d8942e4c35..00000000000 --- a/egs/timit/s4/utils/score_text.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -if [ -f ./path.sh ]; then . 
./path.sh; fi - -if [ $# -ne 4 ]; then - echo "Usage: score_text.sh " - exit 1; -fi - -dir=$1 -symtab=$2 -data=$3 -phonemap=$4 - -if [ ! -f $data/text ]; then - echo Could not find transcriptions in $data/text - exit 1 -fi - -trans=$data/text -sort -k1,1 $trans > $dir/test.trans - -# We assume the transcripts are already in integer form. -cat $dir/*.tra | sort -k1,1 \ - | int2sym.pl --ignore-first-field $symtab \ - | timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \ - > $dir/text - -compute-wer --text --mode=present ark:$dir/test.trans ark,p:$dir/text \ - >& $dir/wer - -grep WER $dir/wer - diff --git a/egs/timit/s4/utils/silphones.pl b/egs/timit/s4/utils/silphones.pl deleted file mode 100755 index 3ff85dfe3bb..00000000000 --- a/egs/timit/s4/utils/silphones.pl +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# creates integer lists of silence and non-silence phones in files, -# e.g. silphones.csl="1:2:3 \n" -# and nonsilphones.csl="4:5:6:7:...:24\n"; - -if(@ARGV != 4) { - die "Usage: silphones.pl phones.txt \"sil1 sil2 sil3\" silphones.csl nonsilphones.csl"; -} - -($symtab, $sillist, $silphones, $nonsilphones) = @ARGV; -open(S,"<$symtab") || die "Opening symbol table $symtab"; - - -foreach $s (split(" ", $sillist)) { - $issil{$s} = 1; -} - -@sil = (); -@nonsil = (); -while(){ - @A = split(" ", $_); - @A == 2 || die "Bad line $_ in phone-symbol-table file $symtab"; - ($sym, $int) = @A; - if($int != 0) { - if($issil{$sym}) { push @sil, $int; $seensil{$sym}=1; } - else { push @nonsil, $int; } - } -} - -foreach $k(keys %issil) { - if(!$seensil{$k}) { die "No such silence phone $k"; } -} -open(F, ">$silphones") || die "opening silphones file $silphones"; -open(G, ">$nonsilphones") || die "opening nonsilphones file $nonsilphones"; -print F join(":", @sil) . "\n"; -print G join(":", @nonsil) . "\n"; -close(F); -close(G); -if(@sil == 0) { print STDERR "Warning: silphones.pl no silence phones.\n" } -if(@nonsil == 0) { print STDERR "Warning: silphones.pl no non-silence phones.\n" } - diff --git a/egs/timit/s4/utils/split_data.sh b/egs/timit/s4/utils/split_data.sh deleted file mode 100755 index 19431aa5c6d..00000000000 --- a/egs/timit/s4/utils/split_data.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -o errexit - -if [ $# != 2 ]; then - echo "Usage: split_data.sh data-dir num-to-split" - exit 1 -fi - -data=$1 -numsplit=$2 - -if [ $numsplit -le 0 ]; then - echo "Invalid num-split argument $numsplit"; - exit 1; -fi - -n=0; -feats="" -wavs="" -utt2spks="" -texts="" - -nu=`cat $data/utt2spk | wc -l` -nf=`cat $data/feats.scp | wc -l` -nt=`cat $data/text | wc -l` -if [ $nu -ne $nf ]; then - echo "split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf);" - echo "this script may produce incorrectly split data." - echo "use utils/fix_data_dir.sh to fix this." -fi -if [ $nt -ne 0 -a $nu -ne $nt ]; then - echo "split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt);" - echo "this script may produce incorrectly split data." - echo "use utils/fix_data_dir.sh to fix this." -fi - -# utilsscripts/get_split.pl returns "0 1 2 3" or "00 01 .. 18 19" or whatever. -# for n in `get_splits.pl $numsplit`; do -for n in `seq 1 $numsplit`; do # Changed this to usual number sequence -Arnab - mkdir -p $data/split$numsplit/$n - feats="$feats $data/split$numsplit/$n/feats.scp" - wavs="$wavs $data/split$numsplit/$n/wav.scp" - texts="$texts $data/split$numsplit/$n/text" - utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" -done - -split_scp.pl --utt2spk=$data/utt2spk $data/utt2spk $utt2spks -split_scp.pl --utt2spk=$data/utt2spk $data/feats.scp $feats -[ -f $data/wav.scp ] && \ - split_scp.pl --utt2spk=$data/utt2spk $data/wav.scp $wavs -[ -f $data/text ] && \ - split_scp.pl --utt2spk=$data/utt2spk $data/text $texts - -# for n in `get_splits.pl $numsplit`; do -for n in `seq 1 $numsplit`; do # Changed this to usual number sequence -Arnab - utt2spk_to_spk2utt.pl $data/split$numsplit/$n/utt2spk \ - > $data/split$numsplit/$n/spk2utt - # for completeness, also split the spk2gender file - [ -f $data/spk2gender ] && \ - filter_scp.pl $data/split$numsplit/$n/spk2utt $data/spk2gender \ - > $data/split$numsplit/$n/spk2gender -done - -exit 0 diff --git a/egs/timit/s4/utils/split_scp.pl b/egs/timit/s4/utils/split_scp.pl deleted file mode 100755 index f1054d323eb..00000000000 --- a/egs/timit/s4/utils/split_scp.pl +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program splits up any kind of .scp or archive-type file. -# If there is no utt2spk option it will work on any text file and -# will split it up with an approximately equal number of lines in -# each but. -# With the --utt2spk option it will work on anything that has the -# utterance-id as the first entry on each line; the utt2spk file is -# of the form "utterance speaker" (on each line). -# It splits it into equal size chunks as far as it can. 
If you use -# the utt2spk option it will make sure these chunks coincide with -# speaker boundaries. In this case, if there are more chunks -# than speakers (and in some other circumstances), some of the -# resulting chunks will be empty and it -# will print a warning. -# You will normally call this like: -# split_scp.pl scp scp.1 scp.2 scp.3 ... -# or -# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... -# Note that you can use this script to split the utt2spk file itself, -# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... - -# You can also call the scripts like: -# split_scp.pl -j 3 0 scp scp.0 -# [note: with this option, it assumes zero-based indexing of the split parts, -# i.e. the second number must be 0 <= n < num-jobs.] - -$num_jobs = 0; -$job_id = 0; -$utt2spk_file = ""; - -for ($x = 1; $x <= 2; $x++) { - if ($ARGV[0] eq "-j") { - shift @ARGV; - $num_jobs = shift @ARGV; - $job_id = shift @ARGV; - if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { - die "Invalid num-jobs and job-id: $num_jobs and $job_id"; - } - } - if ($ARGV[0] =~ "--utt2spk=(.+)") { - $utt2spk_file=$1; - shift; - } -} - -if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... \n" . - " or: split_scp.pl -j num-jobs job-id [--utt2spk=] in.scp [out.scp]\n" . - " ... where 0 <= job-id < num-jobs."; -} - -$inscp = shift @ARGV; -if ($num_jobs == 0) { # without -j option - @OUTPUTS = @ARGV; -} else { - for ($j = 0; $j < $num_jobs; $j++) { - if ($j == $job_id) { - if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } - else { push @OUTPUTS, "-"; } - } else { - push @OUTPUTS, "/dev/null"; - } - } -} - -if ($utt2spk_file ne "") { # We have the --utt2spk option... - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { - @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; - ($u,$s) = @A; - $utt2spk{$u} = $s; - } - open(I, "<$inscp") || die "Opening input scp file $inscp"; - @spkrs = (); - while() { - @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } - $u = $A[0]; - $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } - if(!defined $spk_count{$s}) { - push @spkrs, $s; - $spk_count{$s} = 0; - $spk_data{$s} = ""; - } - $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; - } - # Now split as equally as possible .. - # First allocate spks to files by allocating an approximately - # equal number of speakers. - $numspks = @spkrs; # number of speakers. - $numscps = @OUTPUTS; # number of output files. - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scparray[$scpidx] = []; # [] is array reference. - } - for ($spkidx = 0; $spkidx < $numspks; $spkidx++) { - $scpidx = int(($spkidx*$numscps) / $numspks); - $spk = $spkrs[$spkidx]; - push @{$scparray[$scpidx]}, $spk; - $scpcount[$scpidx] += $spk_count{$spk}; - } - - # Now will try to reassign beginning + ending speakers - # to different scp's and see if it gets more balanced. - # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. - # We can show that if considering changing just 2 scp's, we minimize - # this by minimizing the squared difference in sizes. This is - # equivalent to minimizing the absolute difference in sizes. This - # shows this method is bound to converge. 
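For reference, the algebra behind the convergence claim in the comment above: if two adjacent chunks hold n_1 and n_2 utterances with fixed total T = n_1 + n_2 and the target average is \bar{n}, then
(n_1 - \bar{n})^2 + (n_2 - \bar{n})^2 = \tfrac{1}{2}\bigl(T^2 + (n_1 - n_2)^2\bigr) - 2\bar{n}T + 2\bar{n}^2,
so with T and \bar{n} fixed, minimizing the objective over moves between those two chunks is the same as minimizing |n_1 - n_2|; each accepted move strictly decreases that non-negative integer, so the reassignment sweep must terminate.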
- - $changed = 1; - while($changed) { - $changed = 0; - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - # First try to reassign ending spk of this scp. - if($scpidx < $numscps-1) { - $sz = @{$scparray[$scpidx]}; - if($sz > 0) { - $spk = $scparray[$scpidx]->[$sz-1]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx]; - $nutt2 = $scpcount[$scpidx+1]; - if( abs( ($nutt2+$count) - ($nutt1-$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx+1] += $count; - $scpcount[$scpidx] -= $count; - pop @{$scparray[$scpidx]}; - unshift @{$scparray[$scpidx+1]}, $spk; - $changed = 1; - } - } - } - if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { - $spk = $scparray[$scpidx]->[0]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx-1]; - $nutt2 = $scpcount[$scpidx]; - if( abs( ($nutt2-$count) - ($nutt1+$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx-1] += $count; - $scpcount[$scpidx] -= $count; - shift @{$scparray[$scpidx]}; - push @{$scparray[$scpidx-1]}, $spk; - $changed = 1; - } - } - } - } - # Now print out the files... - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $OUTPUTS[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; - $count = 0; - if(@{$scparray[$scpidx]} == 0) { - print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; - } else { - foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; - $count += $spk_count{$spk}; - } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } - } - close(F); - } -} else { - # This block is the "normal" case where there is no --utt2spk - # option and we just break into equal size chunks. - - open(I, "<$inscp") || die "Opening input scp file $inscp"; - - $numscps = @OUTPUTS; # size of array. - @F = (); - while() { - push @F, $_; - } - $numlines = @F; - if($numlines == 0) { - print STDERR "split_scp.pl: warning: empty input scp file $inscp"; - } - $linesperscp = int( ($numlines+($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up. -# [just doing int() rounds down]. - for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { - $scpfile = $OUTPUTS[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; - for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) { - print O $F[$n]; - } - close(O) || die "Closing scp file $scpfile"; - } -} diff --git a/egs/timit/s4/utils/submit_jobs.sh b/egs/timit/s4/utils/submit_jobs.sh deleted file mode 100755 index 98e17c763fb..00000000000 --- a/egs/timit/s4/utils/submit_jobs.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -u - -# Copyright 2012 Arnab Ghoshal - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -set -o errexit - -function error_exit () { - echo -e "$@" >&2; exit 1; -} - -function readposint () { - local retval=`expr "X$1" : '[^=]*=\(.*\)'`; - retval=${retval#0*} # Strip any leading 0's - [[ "$retval" =~ ^[1-9][0-9]*$ ]] \ - || error_exit "Argument \"$retval\" not a positive integer." - echo $retval -} - -PROG=`basename $0`; -usage="Usage: $PROG [options] --log=logfile command\n -Runs the supplied command and redirect the stdout & stderr to logfile.\n -With the --qcmd option, the command is submitted to a grid engine.\n -Any 'TASK_ID' in logfile or command is replaced with job number or \$SGE_TASK_ID (for SGE).\n\n -Required arguments:\n - --log=FILE\tOutput of command redirected to this file.\n\n -Options:\n - --njobs=INT\tNumber of jobs to run (default=1). Assumes split data exists.\n - --qcmd=STRING\tCommand for submitting a job to a grid engine (e.g. qsub) including switches.\n -"; - -if [ $# -lt 2 ]; then - error_exit $usage; -fi - -NJOBS=1 # Default number of jobs -QCMD="" # No grid usage by default -while [ $# -gt 1 ]; do - case "${1# *}" in # ${1# *} strips any leading spaces from the arguments - --help) echo -e $usage; exit 0 ;; - --qcmd=*) - QCMD=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - --njobs=*) - NJOBS=`readposint $1`; shift ;; - --log=*) - LOGF=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; - -*) echo "Unknown argument: $1, exiting"; echo -e $usage; exit 1 ;; - '') shift ;; # Handle any empty arguments - *) break ;; # interpreted as the command to execute - esac -done - -logfile_base=`basename $LOGF .log` -logfile_dir=`dirname $LOGF` -mkdir -p $logfile_dir; - -# Now, parse the command to execute -exec_cmd=""; -while [ $# -gt 0 ]; do - case "$1" in - *\"*) exec_cmd=$exec_cmd"'''$1''' "; shift ;; - *\ *) exec_cmd=$exec_cmd"\"$1\" "; shift ;; - *) exec_cmd=$exec_cmd"$1 "; shift ;; - esac -done - -function run_locally { - rm -f $logfile_dir/.error; - for n in `seq 1 $NJOBS`; do - local this_logfile=${logfile_base//TASK_ID/$n} - this_logfile=$logfile_dir"/"$this_logfile".log" - local this_command=${exec_cmd//TASK_ID/$n} - ( echo -e "# Command:\n# $this_command"; - echo "# Running on: "`hostname`; - echo "# Started at: "`date`; - eval $this_command || touch $logfile_dir/.error - echo "# Finished at: "`date` ) >> $this_logfile 2>&1 & - done - wait; - [ -f $logfile_dir/.error ] && { rm -f $logfile_dir/.error; \ - error_exit "One (or more) locally run jobs failed."; } - exit 0; -} - -function run_on_grid { - local this_logfile=${logfile_base//TASK_ID/\$SGE_TASK_ID} - this_logfile=$logfile_dir"/"$this_logfile".log" - # If log files are in a separate 'log' directory, create the job submission - # scripts one level up. - local qdir=${logfile_dir/%log/q} - mkdir -p $qdir - local qlog=$qdir/queue.log - local this_command=${exec_cmd//TASK_ID/\$SGE_TASK_ID} - local run_this=$qdir"/"${logfile_base//TASK_ID/}".sh" - run_this=${run_this//../.} - printf "#!/bin/bash\n#\$ -S /bin/bash\n#\$ -V -cwd -j y\n" > $run_this - { printf "set -e\n"; - printf "{ cd %s\n . path.sh\n echo Running on: \`hostname\`\n" "$PWD"; - printf " echo Started at: \`date\`\n $this_command\n ret=\$\?\n"; - printf " echo Finished at: \`date\`\n} >& %s\nexit \$ret\n" "$this_logfile" - printf "# Submitted with:\n" - printf "# $QCMD -sync y -o $qlog -t 1-$NJOBS $run_this >> $qlog 2>&1\n" - } >> $run_this - $QCMD -sync y -o $qlog -t 1-${NJOBS} $run_this >> $qlog 2>&1 - exit $? 
-} - -if [ -z "$QCMD" ]; then - run_locally; -else - run_on_grid; -fi - diff --git a/egs/timit/s4/utils/sym2int.pl b/egs/timit/s4/utils/sym2int.pl deleted file mode 100755 index 71492652c50..00000000000 --- a/egs/timit/s4/utils/sym2int.pl +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; -$ignore_first_field = 0; -for($x = 0; $x < 3; $x++) { - # Note: it will just print OOVS unmodified if you specify --ignore-oov. - # Else will complain and put nothing out. - if($ARGV[0] eq "--ignore-oov") { $ignore_oov = 1; shift @ARGV; } - if($ARGV[0] eq "--ignore-first-field") { $ignore_first_field = 1; shift @ARGV; } - if($ARGV[0] eq "--map-oov") { shift @ARGV; $map_oov = shift @ARGV; } -} - -$symtab = shift @ARGV; -if(!defined $symtab) { - die "Usage: sym2int.pl symtab [input transcriptions] > output transcriptions\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -$num_warning = 0; -$max_warning = 20; -$error = 0; -while(<>) { - @A = split(" ", $_); - if(@A == 0) { - die "Empty line in transcriptions input."; - } - if($ignore_first_field) { - $key = shift @A; - print $key . " "; - } - @B = (); - foreach $a (@A) { - $i = $sym2int{$a}; - if(!defined ($i)) { - if (defined $map_oov) { - if (!defined $sym2int{$map_oov}) { - die "sym2int.pl: invalid map-oov option $map_oov (symbol not defined in $symtab)"; - } - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $sym2int{$map_oov}; - } elsif($ignore_oov) { - $i = $a; # just print them out unmodified.. - } else { - die "sym2int.pl: undefined symbol $a\n"; - } - } - push @B, $i; - } - print join(" ", @B); - print "\n"; -} - -if($error) { exit(1); } -else { exit(0); } - - - diff --git a/egs/timit/s4/utils/utt2spk_to_spk2utt.pl b/egs/timit/s4/utils/utt2spk_to_spk2utt.pl deleted file mode 100755 index 0dfb7ba5fd3..00000000000 --- a/egs/timit/s4/utils/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - $uttlist{$s} = $uttlist{$s} . "$u "; -} -foreach $s (@spklist) { - $l = $uttlist{$s}; - $l =~ s: $::; # remove trailing space. - print "$s $l\n"; -} diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" - export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" -else - echo "$0: you need to define options for your cluster." 
- exit 1; + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/timit/s5/local/score_basic.sh b/egs/timit/s5/local/score_basic.sh index 102f2028635..2dbffe38e80 100755 --- a/egs/timit/s5/local/score_basic.sh +++ b/egs/timit/s5/local/score_basic.sh @@ -55,6 +55,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score_basic.LMWT.log \ utils/int2sym.pl -f 2- $symtab \| \ local/timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \| \ compute-wer --text --mode=all \ - ark:$dir/scoring/test_filt.txt ark,p:- $dir/scoring/wer_stats_LMWT ">&" $dir/wer_LMWT || exit 1; + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; exit 0; diff --git a/egs/timit/s5/local/timit_format_data.sh b/egs/timit/s5/local/timit_format_data.sh index 019d74dcfc7..4e8816a6799 100755 --- a/egs/timit/s5/local/timit_format_data.sh +++ b/egs/timit/s5/local/timit_format_data.sh @@ -16,7 +16,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/dict/lexicon.txt mkdir -p $tmpdir -for x in train dev test; do +for x in train dev test; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.text data/$x/text || exit 1; @@ -37,13 +37,10 @@ for lm_suffix in bg; do test=data/lang_test_${lm_suffix} mkdir -p $test cp -r data/lang/* $test - + gunzip -c $lmdir/lm_phone_${lm_suffix}.arpa.gz | \ - egrep -v ' | | ' | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -61,7 +58,7 @@ for lm_suffix in bg; do < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g done diff --git a/egs/timit/s5/path.sh b/egs/timit/s5/path.sh index 1e48f21b323..62794699b41 100755 --- a/egs/timit/s5/path.sh +++ b/egs/timit/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/voxforge/online_demo/run.sh b/egs/voxforge/online_demo/run.sh index 938061ebb4f..6a7e89991b6 100755 --- a/egs/voxforge/online_demo/run.sh +++ b/egs/voxforge/online_demo/run.sh @@ -3,6 +3,8 @@ # Copyright 2012 Vassil Panayotov # Apache 2.0 +# Note: you have to do 'make ext' in ../../../src/ before running this. + # Set the paths to the binaries and scripts needed KALDI_ROOT=`pwd`/../../.. export PATH=$PWD/../s5/utils/:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/bin:$PATH @@ -29,7 +31,7 @@ audio=${data_file}/audio if [ ! -s ${data_file}.tar.bz2 ]; then echo "Downloading test models and data ..." wget -T 10 -t 3 $data_url; - + if [ ! -s ${data_file}.tar.bz2 ]; then echo "Download of $data_file has failed!" exit 1 @@ -53,11 +55,11 @@ case $test_mode in echo " estimated on an audio book's text. The text in question is" echo " \"King Solomon's Mines\" (http://www.gutenberg.org/ebooks/2166)." echo " You may want to read some sentences from this book first ..." - echo + echo online-gmm-decode-faster --rt-min=0.5 --rt-max=0.7 --max-active=4000 \ --beam=12.0 --acoustic-scale=0.0769 $ac_model/model $ac_model/HCLG.fst \ $ac_model/words.txt '1:2:3:4:5' $trans_matrix;; - + simulated) echo echo -e " SIMULATED ONLINE DECODING - pre-recorded audio is used\n" @@ -70,7 +72,7 @@ case $test_mode in echo " NOTE: Using utterances from the book, on which the LM was estimated" echo " is considered to be \"cheating\" and we are doing this only for" echo " the purposes of the demo." - echo + echo echo " You can type \"./run.sh --test-mode live\" to try it using your" echo " own voice!" echo @@ -87,7 +89,7 @@ case $test_mode in scp:$decode_dir/input.scp $ac_model/model $ac_model/HCLG.fst \ $ac_model/words.txt '1:2:3:4:5' ark,t:$decode_dir/trans.txt \ ark,t:$decode_dir/ali.txt $trans_matrix;; - + *) echo "Invalid test mode! Should be either \"live\" or \"simulated\"!"; exit 1;; @@ -97,7 +99,7 @@ esac if [ $test_mode == "simulated" ]; then # Convert the reference transcripts from symbols to word IDs sym2int.pl -f 2- $ac_model/words.txt < $audio/trans.txt > $decode_dir/ref.txt - + # Compact the hypotheses belonging to the same test utterance cat $decode_dir/trans.txt |\ sed -e 's/^\(test[0-9]\+\)\([^ ]\+\)\(.*\)/\1 \3/' |\ diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/voxforge/s5/local/voxforge_format_data.sh b/egs/voxforge/s5/local/voxforge_format_data.sh index 910be33bd75..6abaf6c7656 100755 --- a/egs/voxforge/s5/local/voxforge_format_data.sh +++ b/egs/voxforge/s5/local/voxforge_format_data.sh @@ -12,7 +12,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/dict/lexicon.txt mkdir -p $tmpdir -for x in train test; do +for x in train test; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/${x}_trans.txt data/$x/text || exit 1; @@ -33,22 +33,8 @@ for f in phones.txt words.txt phones.txt L.fst L_disambig.fst phones; do cp -r data/lang/$f $test done cat $lmdir/lm.arpa | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs.txt - -# grep -v ' ' because the LM seems to have some strange and useless -# stuff in it with multiple 's in the history. Encountered some other similar -# things in a LM from Geoff. Removing all "illegal" combinations of and , -# which are supposed to occur only at being/end of utt. These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -cat $lmdir/lm.arpa | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst fstisstochastic $test/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -67,9 +53,8 @@ awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print " fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > $tmpdir/g/empty_words.fst -fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && +fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -rf $tmpdir echo "*** Succeeded in formatting data." - diff --git a/egs/voxforge/s5/path.sh b/egs/voxforge/s5/path.sh index 6740f11d675..d5ee6268bae 100755 --- a/egs/voxforge/s5/path.sh +++ b/egs/voxforge/s5/path.sh @@ -1,5 +1,8 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh # VoxForge data will be stored in: export DATA_ROOT="/home/dpovey/kaldi-clean/egs/voxforge/s5/voxforge" # e.g. something like /media/secondary/voxforge diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_cz/s5/local/create_G.sh b/egs/vystadial_cz/s5/local/create_G.sh index 7be19f7f03f..b462b9eab01 100755 --- a/egs/vystadial_cz/s5/local/create_G.sh +++ b/egs/vystadial_cz/s5/local/create_G.sh @@ -17,7 +17,7 @@ for lm in $LMs ; do lmp=$lmdir/`basename $lm` tmpdir=$tgt/tmp - mkdir -p $tgt + mkdir -p $tgt mkdir -p $tmpdir echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..." @@ -26,21 +26,9 @@ for lm in $LMs ; do ln -s $langdir/$f $tgt/$f 2> /dev/null done - cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. 
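For context, the deleted pipeline just below (the same block is removed from the voxforge and vystadial_en recipes) filtered ill-formed sentence-boundary n-grams (combinations of <s> and </s>, e.g. "<s> <s>" or "</s> </s>") out of the ARPA LM before compiling it into G.fst. A rough sketch of that older style, with placeholder file names, is:

  cat lm.arpa | \
    grep -v '<s> <s>' | grep -v '</s> <s>' | grep -v '</s> </s>' | \
    arpa2fst - | fstprint | \
    utils/remove_oovs.pl oovs.txt | \
    utils/eps2disambig.pl | utils/s2eps.pl | \
    fstcompile --isymbols=words.txt --osymbols=words.txt \
      --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon | fstarcsort --sort_type=ilabel > G.fst

The replacement introduced by this patch collapses all of that into a single call of the form "arpa2fst --disambig-symbol=#0 --read-symbol-table=words.txt".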
- cat $lmp | \ - grep -v ' \| \| ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \ - --osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$tgt/words.txt - $tgt/G.fst fstisstochastic $tgt/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -48,7 +36,7 @@ for lm in $LMs ; do # nonzero because the backoff weights make the states sum to >1). # Because of the fiasco for these particular LMs, the first number is not # as close to zero as it could be. - + # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); # this might cause determinization failure of CLG. @@ -59,7 +47,7 @@ for lm in $LMs ; do fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 # rm -rf $tmpdir # TODO debugging diff --git a/egs/vystadial_cz/s5/path.sh b/egs/vystadial_cz/s5/path.sh index 98bd2fab462..4fa5bb91042 100755 --- a/egs/vystadial_cz/s5/path.sh +++ b/egs/vystadial_cz/s5/path.sh @@ -1,9 +1,12 @@ # Needed for "correct" sorting +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils:$PWD/steps:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C -export KALDI_ROOT=../../.. # adding Kaldi binaries to path -export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/local/create_G.sh b/egs/vystadial_en/s5/local/create_G.sh index 7be19f7f03f..b462b9eab01 100755 --- a/egs/vystadial_en/s5/local/create_G.sh +++ b/egs/vystadial_en/s5/local/create_G.sh @@ -17,7 +17,7 @@ for lm in $LMs ; do lmp=$lmdir/`basename $lm` tmpdir=$tgt/tmp - mkdir -p $tgt + mkdir -p $tgt mkdir -p $tmpdir echo "--- Preparing the grammar transducer (G.fst) from $lmp in $tgt ..." @@ -26,21 +26,9 @@ for lm in $LMs ; do ln -s $langdir/$f $tgt/$f 2> /dev/null done - cat $lmp | utils/find_arpa_oovs.pl $tgt/words.txt > $tmpdir/oovs.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - cat $lmp | \ - grep -v ' \| \| ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$tgt/words.txt \ - --osymbols=$tgt/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$tgt/words.txt - $tgt/G.fst fstisstochastic $tgt/G.fst # The output is like: # 9.14233e-05 -0.259833 @@ -48,7 +36,7 @@ for lm in $LMs ; do # nonzero because the backoff weights make the states sum to >1). # Because of the fiasco for these particular LMs, the first number is not # as close to zero as it could be. - + # Everything below is only for diagnostic. # Checking that G has no cycles with empty words on them (e.g. , ); # this might cause determinization failure of CLG. @@ -59,7 +47,7 @@ for lm in $LMs ; do fstcompile --isymbols=$tgt/words.txt --osymbols=$tgt/words.txt \ $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $tgt/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 # rm -rf $tmpdir # TODO debugging diff --git a/egs/vystadial_en/s5/path.sh b/egs/vystadial_en/s5/path.sh index d34cd4cbe5e..d864305627b 100755 --- a/egs/vystadial_en/s5/path.sh +++ b/egs/vystadial_en/s5/path.sh @@ -1,9 +1,10 @@ -# Needed for "correct" sorting +export KALDI_ROOT=`pwd`/../../.. 
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$PWD:$PWD/utils:$PWD/steps:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C -export KALDI_ROOT=../../.. -# adding Kaldi binaries to path -export PATH=$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$PWD:$PWD/utils:$PWD/steps:$PATH srilm_bin=$KALDI_ROOT/tools/srilm/bin/ diff --git a/egs/wsj/s5/RESULTS b/egs/wsj/s5/RESULTS index 2bb1d2124d0..acff4f9d7fe 100644 --- a/egs/wsj/s5/RESULTS +++ b/egs/wsj/s5/RESULTS @@ -89,22 +89,6 @@ exit 0 %WER 5.74 [ 324 / 5643, 46 ins, 41 del, 237 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_19 %WER 5.90 [ 333 / 5643, 46 ins, 39 del, 248 sub ] exp/tri3b/decode_bd_tgpr_eval92_tg/wer_18 -# this section demonstrates RNNLM-HS rescoring (commented out by default) -# the exact results might differ insignificantly due to hogwild in RNNLM-HS training that introduces indeterminism -%WER 5.92 [ 334 / 5643, 58 ins, 32 del, 244 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg/wer_14 # baseline (no rescoring) -%WER 5.26 [ 297 / 5643, 47 ins, 29 del, 221 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs100_0.3/wer_15 -%WER 5.17 [ 292 / 5643, 46 ins, 30 del, 216 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs300_0.3/wer_16 -%WER 5.64 [ 318 / 5643, 50 ins, 34 del, 234 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs30_0.15/wer_16 -%WER 5.55 [ 313 / 5643, 51 ins, 32 del, 230 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.15/wer_16 -%WER 5.55 [ 313 / 5643, 51 ins, 32 del, 230 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.15_N1000/wer_16 -%WER 5.39 [ 304 / 5643, 50 ins, 30 del, 224 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3/wer_15 -%WER 5.42 [ 306 / 5643, 50 ins, 30 del, 226 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3_N10/wer_15 -%WER 5.39 [ 304 / 5643, 50 ins, 30 del, 224 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.3_N1000/wer_15 -%WER 5.37 [ 303 / 5643, 49 ins, 29 del, 225 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.4/wer_14 -%WER 5.37 [ 303 / 5643, 49 ins, 29 del, 225 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.4_N1000/wer_14 -%WER 5.26 [ 297 / 5643, 45 ins, 32 del, 220 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.5_N1000/wer_15 -%WER 5.14 [ 290 / 5643, 43 ins, 32 del, 215 sub ] exp/tri3b/decode_bd_tgpr_eval92_fg_rnnlm-hs400_0.75_N1000/wer_18 - %WER 14.17 [ 1167 / 8234, 222 ins, 123 del, 822 sub ] exp/tri3b/decode_tgpr_dev93/wer_17 %WER 19.37 [ 1595 / 8234, 315 ins, 153 del, 1127 sub ] exp/tri3b/decode_tgpr_dev93.si/wer_15 @@ -277,4 +261,53 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_*; do grep WER $x/wer_* | util %WER 6.68 [ 377 / 5643, 102 ins, 13 del, 262 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_utt/wer_10 %WER 6.56 [ 370 / 5643, 100 ins, 12 del, 258 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_utt_offline/wer_10 - +# RNNLM n-best rescoring with Mikolov's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm.h300.voc40k; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.60 [ 461 / 8234, 51 ins, 70 del, 340 sub ] 
exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm.h300.voc40k/wer_15_0.0 +%WER 2.64 [ 149 / 5643, 21 ins, 13 del, 115 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm.h300.voc40k/wer_11_0.5 +%WER 8.16 [ 672 / 8234, 136 ins, 70 del, 466 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm.h300.voc40k/wer_14_0.5 +%WER 5.39 [ 304 / 5643, 74 ins, 16 del, 214 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm.h300.voc40k/wer_17_0.5 + +# RNNLM lattice rescoring with Mikolov's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm.h300.voc40k_lat; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.05 [ 416 / 8234, 47 ins, 72 del, 297 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm.h300.voc40k_lat/wer_16_0.0 +%WER 2.59 [ 146 / 5643, 19 ins, 14 del, 113 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm.h300.voc40k_lat/wer_10_0.5 +%WER 7.70 [ 634 / 8234, 133 ins, 67 del, 434 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm.h300.voc40k_lat/wer_13_0.5 +%WER 5.25 [ 296 / 5643, 81 ins, 14 del, 201 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm.h300.voc40k_lat/wer_14_0.5 + +# RNNLM n-best rescoring with Yandex's model: +for x in exp/nnet2_online/nnet_ms_a_online/decode_*rnnlm-hs.nce20.h400.voc40k; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 5.31 [ 437 / 8234, 50 ins, 66 del, 321 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_dev93_rnnlm-hs.nce20.h400.voc40k/wer_13_0.0 +%WER 2.91 [ 164 / 5643, 24 ins, 9 del, 131 sub ] exp/nnet2_online/nnet_ms_a_online/decode_bd_tgpr_eval92_rnnlm-hs.nce20.h400.voc40k/wer_10_0.0 +%WER 7.83 [ 645 / 8234, 159 ins, 50 del, 436 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_dev93_rnnlm-hs.nce20.h400.voc40k/wer_11_0.0 +%WER 5.40 [ 305 / 5643, 77 ins, 16 del, 212 sub ] exp/nnet2_online/nnet_ms_a_online/decode_tgpr_eval92_rnnlm-hs.nce20.h400.voc40k/wer_13_1.0 + +# TDNN results: +for x in exp/nnet3/nnet_tdnn_a/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done +%WER 7.19 [ 592 / 8234, 51 ins, 109 del, 432 sub ] exp/nnet3/nnet_tdnn_a/decode_bd_tgpr_dev93/wer_13_0.5 +%WER 3.93 [ 222 / 5643, 23 ins, 20 del, 179 sub ] exp/nnet3/nnet_tdnn_a/decode_bd_tgpr_eval92/wer_10_1.0 +%WER 9.78 [ 805 / 8234, 167 ins, 72 del, 566 sub ] exp/nnet3/nnet_tdnn_a/decode_tgpr_dev93/wer_10_0.0 +%WER 6.40 [ 361 / 5643, 87 ins, 16 del, 258 sub ] exp/nnet3/nnet_tdnn_a/decode_tgpr_eval92/wer_10_1.0 + +# local/nnet3/run_lstm.sh +# LSTM results: cell_dim=1024, recurrent_projection_dim=non_recurrent_projection_dim=256,lstm_delay=-1 -2 -3, label_delay=5, num_params=11894059 +%WER 7.32 exp/nnet3/lstm_ld5/decode_bd_tgpr_dev93/wer_11_0.0 +%WER 4.24 exp/nnet3/lstm_ld5/decode_bd_tgpr_eval92/wer_10_1.0 +%WER 9.57 exp/nnet3/lstm_ld5/decode_tgpr_dev93/wer_9_1.0 +%WER 6.86 exp/nnet3/lstm_ld5/decode_tgpr_eval92/wer_10_1.0 + +# bidirectional LSTM +# ----------------------- +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 640 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 +# num_params=11485739 +%WER 6.81 exp/nnet3/lstm_bidirectional/decode_bd_tgpr_dev93/wer_11_0.0 +%WER 4.27 exp/nnet3/lstm_bidirectional/decode_bd_tgpr_eval92/wer_11_0.0 +%WER 9.29 exp/nnet3/lstm_bidirectional/decode_tgpr_dev93/wer_11_0.5 +%WER 6.61 exp/nnet3/lstm_bidirectional/decode_tgpr_eval92/wer_11_1.0 diff --git 
a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 00aa0c145a3..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,30 +1,29 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi - diff --git a/egs/wsj/s5/local/cstr_wsj_extend_dict.sh b/egs/wsj/s5/local/cstr_wsj_extend_dict.sh index b2a9faad704..8004db1d924 100755 --- a/egs/wsj/s5/local/cstr_wsj_extend_dict.sh +++ b/egs/wsj/s5/local/cstr_wsj_extend_dict.sh @@ -12,6 +12,11 @@ # way. # It makes use of scripts in local/dict/ +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + if [ $# -ne 1 ]; then echo "Usage: local/cstr_wsj_train_lms.sh WSJ1_doc_dir" exit 1 @@ -25,19 +30,20 @@ if [ ! -d $srcdir/lng_modl ]; then exit 1 fi -mkdir -p data/local/dict_larger -dir=data/local/dict_larger -cp data/local/dict/* data/local/dict_larger # Various files describing phones etc. +mkdir -p data/local/dict${dict_suffix}_larger +dir=data/local/dict${dict_suffix}_larger +cp data/local/dict${dict_suffix}/* data/local/dict${dict_suffix}_larger # Various files describing phones etc. 
# are there; we just want to copy them as the phoneset is the same. -rm data/local/dict_larger/lexicon.txt # we don't want this. +rm data/local/dict${dict_suffix}_larger/lexicon.txt # we don't want this. +rm data/local/dict${dict_suffix}_larger/lexiconp.txt # we don't want this either. mincount=2 # Minimum count of an OOV we will try to generate a pron for. -[ ! -f data/local/dict/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; +[ ! -f data/local/dict${dict_suffix}/cmudict/cmudict.0.7a ] && echo "CMU dict not in expected place" && exit 1; # Remove comments from cmudict; print first field; remove # words like FOO(1) which are alternate prons: our dict format won't # include these markers. -grep -v ';;;' data/local/dict/cmudict/cmudict.0.7a | +grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu diff --git a/egs/wsj/s5/local/nnet2/run_5e_gpu.sh b/egs/wsj/s5/local/nnet2/run_5e_gpu.sh index d8ee94dd291..bae7327788e 100755 --- a/egs/wsj/s5/local/nnet2/run_5e_gpu.sh +++ b/egs/wsj/s5/local/nnet2/run_5e_gpu.sh @@ -15,14 +15,14 @@ dir=exp/nnet5e_gpu # learning rate, relative to run_5c.sh . ././cmd.sh . ./path.sh -! cuda-compiled && cat </dev/null +local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ + --cmd "$decode_cmd --mem 16G" \ + --hidden 300 --nwords 40000 --class 400 \ + --direct 2000 data/local/rnnlm.h300.voc40k \ + || touch data/local/rnnlm.h300.voc40k/.error & + +# Trains Yandex's version, which takes roughly 10 hours with the following +# parameter setting. We start from the dictionary directory without silence +# probabilities (with suffix "_nosp"). +num_threads_rnnlm=8 +rm data/local/rnnlm-hs.nce20.h400.voc40k/.error 2>/dev/null +local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ + --rnnlm_ver faster-rnnlm --threads $num_threads_rnnlm \ + --cmd "$decode_cmd --mem 8G --num-threads $num_threads_rnnlm" \ + --bptt 4 --bptt-block 10 --hidden 400 --nwords 40000 --direct 2000 \ + --rnnlm-options "-direct-order 4 -nce 20" \ + data/local/rnnlm-hs.nce20.h400.voc40k \ + || touch data/local/rnnlm-hs.nce20.h400.voc40k/.error & + +wait; + +# Rescoring. We demonstrate results on the TDNN models. Make sure you have +# finished running the following scripts: +# local/online/run_nnet2.sh +# local/online/run_nnet2_baseline.sh +# local/online/run_nnet2_discriminative.sh +for lm_suffix in tgpr bd_tgpr; do + graph_dir=exp/tri4b/graph_${lm_suffix} + for year in eval92 dev93; do + decode_dir=exp/nnet2_online/nnet_ms_a_online/decode_${lm_suffix}_${year} + + # N-best rescoring with Tomas Mikolov's version. + steps/rnnlmrescore.sh \ + --N 1000 --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.75 \ + data/lang_test_${lm_suffix} data/local/rnnlm.h300.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm.h300.voc40k || exit 1; + + # Lattice rescoring with Tomas Mikolov's version. + steps/lmrescore_rnnlm_lat.sh \ + --weight 0.75 --cmd "$decode_cmd --mem 16G" --max-ngram-order 5 \ + data/lang_test_${lm_suffix} data/local/rnnlm.h300.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm.h300.voc40k_lat || exit 1; + + # N-best rescoring with Yandex's version. 
+ steps/rnnlmrescore.sh --rnnlm_ver faster-rnnlm \ + --N 1000 --cmd "$decode_cmd --mem 8G" --inv-acwt 10 0.75 \ + data/lang_test_${lm_suffix} data/local/rnnlm-hs.nce20.h400.voc40k \ + data/test_${year} ${decode_dir} \ + ${decode_dir}_rnnlm-hs.nce20.h400.voc40k || exit 1; + done +done diff --git a/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh b/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh deleted file mode 100755 index 67fcee50a93..00000000000 --- a/egs/wsj/s5/local/run_rnnlms_sgmm5b.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -for test in dev93 eval92; do - - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \ - data/test_${test} exp/sgmm5b_mmi_b0.1/decode_bd_tgpr_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 || exit 1; - - -# Note: for N-best-list generation, choosing the acoustic scale (12) that gave -# the best WER on this test set. Ideally we should do this on a dev set. - - # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.25 data/lang_test_bd_fg data/local/rnnlm.h30.voc10k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm30_0.25 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h100.voc20k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm100_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h200.voc30k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm200_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.5 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.5 \ - || exit 1; - - steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 12 \ - 0.75 data/lang_test_bd_fg data/local/rnnlm.h300.voc40k data/test_${test} \ - exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4 exp/sgmm5b_mmi_b0.1/decode_bd_fg_${test}_it4_rnnlm300_0.75 \ - || exit 1; -done diff --git a/egs/wsj/s5/local/run_rnnlms_tri3b.sh b/egs/wsj/s5/local/run_rnnlms_tri3b.sh deleted file mode 100755 index 5d056860848..00000000000 --- a/egs/wsj/s5/local/run_rnnlms_tri3b.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -lang_suffix= - -echo "$0 $@" # Print the command line for logging -. utils/parse_options.sh || exit 1; - -. cmd.sh - - # This step interpolates a small RNNLM (with weight 0.25) with the 4-gram LM. 
-steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.25 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h30.voc10k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm30_0.25 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h100.voc20k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm100_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h200.voc30k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm200_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 || exit 1; - -steps/rnnlmrescore.sh \ - --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.75_N1000 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N1000 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 1000 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.75 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.75 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.75 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -dir=exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.25 -rm -rf $dir -cp -r exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5 $dir -steps/rnnlmrescore.sh \ - --stage 7 --N 100 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.25 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg $dir - -steps/rnnlmrescore.sh \ - --N 10 --cmd "$decode_cmd" --inv-acwt 17 \ - 0.5 data/lang${lang_suffix}_test_bd_fg \ - data/local/rnnlm.h300.voc40k data/test_eval92 \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg \ - exp/tri3b/decode${lang_suffix}_bd_tgpr_eval92_fg_rnnlm300_0.5_N10 \ - || exit 1; - diff --git a/egs/wsj/s5/local/wsj_data_prep.sh b/egs/wsj/s5/local/wsj_data_prep.sh index 3463747138a..edb9e6f2e3a 100755 --- a/egs/wsj/s5/local/wsj_data_prep.sh +++ b/egs/wsj/s5/local/wsj_data_prep.sh @@ -48,6 +48,8 @@ if [ ! -d links/11-13.1 -o ! -d links/13-34.1 -o ! -d links/11-2.1 ]; then echo "wsj_data_prep.sh: Spot check of command line arguments failed" echo "Command line arguments must be absolute pathnames to WSJ directories" echo "with names like 11-13.1." + echo "Note: if you have old-style WSJ distribution," + echo "local/cstr_wsj_data_prep.sh may work instead, see run.sh for example." 
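For readers with such an old-style layout (flat wsj0/ and wsj1/ directories rather than 11-13.1-style subdirectories), here is a minimal sketch of the alternative preparation, following the pointer in the new note ("see run.sh for example"); the corpus path is a placeholder:

  corpus=/path/to/wsj   # placeholder: a root with wsj0/ and wsj1/ directly underneath
  local/cstr_wsj_data_prep.sh $corpus
  local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/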
exit 1; fi @@ -70,14 +72,14 @@ nl=`cat train_si284.flist | wc -l` [ "$nl" -eq 37416 ] || echo "Warning: expected 37416 lines in train_si284.flist, got $nl" # Now for the test sets. -# links/13-34.1/wsj1/doc/indices/readme.doc +# links/13-34.1/wsj1/doc/indices/readme.doc # describes all the different test sets. # Note: each test-set seems to come in multiple versions depending # on different vocabulary sizes, verbalized vs. non-verbalized # pronunciations, etc. We use the largest vocab and non-verbalized # pronunciations. # The most normal one seems to be the "baseline 60k test set", which -# is h1_p0. +# is h1_p0. # Nov'92 (333 utts) # These index files have a slightly different format; @@ -113,8 +115,8 @@ cat links/13-34.1/wsj1/doc/indices/h2_p0.ndx | \ # Dev-set Hub 1,2 (503, 913 utterances) -# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. -# Sometimes this gets copied from the CD's with upcasing, don't know +# Note: the ???'s below match WSJ and SI_DT, or wsj and si_dt. +# Sometimes this gets copied from the CD's with upcasing, don't know # why (could be older versions of the disks). find `readlink links/13-16.1`/???1/??_??_20 -print | grep -i ".wv1" | sort > dev_dt_20.flist find `readlink links/13-16.1`/???1/??_??_05 -print | grep -i ".wv1" | sort > dev_dt_05.flist @@ -136,7 +138,7 @@ noiseword=""; for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1; done - + # Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.) for x in train_si84 train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do awk '{printf("%s '$sph2pipe' -f wav %s |\n", $1, $2);}' < ${x}_sph.scp > ${x}_wav.scp @@ -186,19 +188,19 @@ if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` - rm wsj0-train-spkrinfo.txt ! wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt && \ echo "Getting wsj0-train-spkrinfo.txt from backup location" && \ - wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt + wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt fi if [ ! -f wsj0-train-spkrinfo.txt ]; then echo "Could not get the spkrinfo.txt file from LDC website (moved)?" - echo "This is possibly omitted from the training disks; couldn't find it." + echo "This is possibly omitted from the training disks; couldn't find it." echo "Everything else may have worked; we just may be missing gender info" echo "which is only needed for VTLN-related diagnostics anyway." exit 1 fi # Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the # LDC put it on the web. Perhaps it was accidentally omitted from the -# disks. +# disks. cat links/11-13.1/wsj0/doc/spkrinfo.txt \ links/13-32.1/wsj1/doc/evl_spok/spkrinfo.txt \ diff --git a/egs/wsj/s5/local/wsj_extend_dict.sh b/egs/wsj/s5/local/wsj_extend_dict.sh index 160d866843a..c2b11b8dc8b 100755 --- a/egs/wsj/s5/local/wsj_extend_dict.sh +++ b/egs/wsj/s5/local/wsj_extend_dict.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script builds a larger word-list and dictionary +# This script builds a larger word-list and dictionary # than used for the LMs supplied with the WSJ corpus. # It uses a couple of strategies to fill-in words in # the LM training data but not in CMUdict. 
One is @@ -23,6 +23,8 @@ if [ $# -ne 1 ]; then fi if [ "`basename $1`" != 13-32.1 ]; then echo "Expecting the argument to this script to end in 13-32.1" + echo "Note: if you have old-style WSJ distribution," + echo "local/cstr_wsj_extend_dict.sh may work instead, see run.sh for example." exit 1 fi @@ -46,7 +48,7 @@ mincount=2 # Minimum count of an OOV we will try to generate a pron for. # Remove comments from cmudict; print first field; remove # words like FOO(1) which are alternate prons: our dict format won't # include these markers. -grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | +grep -v ';;;' data/local/dict${dict_suffix}/cmudict/cmudict.0.7a | perl -ane 's/^(\S+)\(\d+\)/$1/; print; ' | sort | uniq > $dir/dict.cmu cat $dir/dict.cmu | awk '{print $1}' | sort | uniq > $dir/wordlist.cmu @@ -67,8 +69,8 @@ else | awk '/^){ chop; $isword{$_} = 1; } - while() { - @A = split(" ", $_); + while() { + @A = split(" ", $_); for ($n = 0; $n < @A; $n++) { $a = $A[$n]; if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) { # nonwords that end in "." @@ -81,7 +83,7 @@ else } ' $dir/wordlist.cmu | gzip -c > $dir/cleaned.gz fi - + # get unigram counts echo "Getting unigram counts" gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \ @@ -105,7 +107,7 @@ cat $dir/oov.counts | awk -v thresh=$mincount '{if ($1 >= thresh) { print $2; }} # First make some prons for possible acronyms. # Note: we don't do this for things like U.K or U.N, -# or A.B. (which doesn't exist anyway), +# or A.B. (which doesn't exist anyway), # as we consider this normalization/spelling errors. cat $dir/oovlist | local/dict/get_acronym_prons.pl $dir/dict.cmu > $dir/dict.acronyms @@ -118,7 +120,7 @@ mkdir $dir/f $dir/b # forward, backward directions of rules... # Remove ; and , from words, if they are present; these # might crash our scripts, as they are used as separators there. -filter_dict.pl $dir/dict.cmu > $dir/f/dict +filter_dict.pl $dir/dict.cmu > $dir/f/dict cat $dir/oovlist | filter_dict.pl > $dir/f/oovs reverse_dict.pl $dir/f/dict > $dir/b/dict reverse_dict.pl $dir/f/oovs > $dir/b/oovs @@ -140,8 +142,8 @@ for d in $dir/f $dir/b; do score_rules.pl rules.with_scores get_candidate_prons.pl rules.with_scores dict oovs | \ limit_candidate_prons.pl hierarchy > oovs.candidates - ) & -done + ) & +done wait # Merge the candidates. 
@@ -159,9 +161,9 @@ sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > add_counts.pl $dir/oov.counts $dir/oovlist.handled | sort -nr > $dir/oovlist.handled.counts add_counts.pl $dir/oov.counts $dir/oovlist.not_handled | sort -nr > $dir/oovlist.not_handled.counts -echo "**Top OOVs we handled are:**"; +echo "**Top OOVs we handled are:**"; head $dir/oovlist.handled.counts -echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; +echo "**Top OOVs we didn't handle are as follows (note: they are mostly misspellings):**"; head $dir/oovlist.not_handled.counts diff --git a/egs/wsj/s5/local/wsj_format_data.sh b/egs/wsj/s5/local/wsj_format_data.sh index c476e83ee6f..897b904db83 100755 --- a/egs/wsj/s5/local/wsj_format_data.sh +++ b/egs/wsj/s5/local/wsj_format_data.sh @@ -27,7 +27,7 @@ tmpdir=data/local/lm_tmp lexicon=data/local/lang${lang_suffix}_tmp/lexiconp.txt mkdir -p $tmpdir -for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do +for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do mkdir -p data/$x cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1; cp $srcdir/$x.txt data/$x/text || exit 1; @@ -49,22 +49,8 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do cp -r data/lang${lang_suffix}/* $test || exit 1; gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - utils/find_arpa_oovs.pl $test/words.txt > $tmpdir/oovs_${lm_suffix}.txt - - # grep -v ' ' because the LM seems to have some strange and useless - # stuff in it with multiple 's in the history. Encountered some other similar - # things in a LM from Geoff. Removing all "illegal" combinations of and , - # which are supposed to occur only at being/end of utt. These can cause - # determinization failures of CLG [ends up being epsilon cycles]. - gunzip -c $lmdir/lm_${lm_suffix}.arpa.gz | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - arpa2fst - | fstprint | \ - utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \ - --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst utils/validate_lang.pl --skip-determinization-check $test || exit 1; done diff --git a/egs/wsj/s5/local/wsj_format_local_lms.sh b/egs/wsj/s5/local/wsj_format_local_lms.sh index 22493fbe963..c415a806fff 100755 --- a/egs/wsj/s5/local/wsj_format_local_lms.sh +++ b/egs/wsj/s5/local/wsj_format_local_lms.sh @@ -45,17 +45,13 @@ fi # Be careful: this time we dispense with the grep -v ' ' so this might # not work for LMs generated from all toolkits. 
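Both wsj_format_data.sh (above) and wsj_format_local_lms.sh (below) now build G.fst with a single arpa2fst invocation instead of the old arpa2fst | fstprint | eps2disambig.pl | s2eps.pl | fstcompile | fstrmepsilon | fstarcsort pipeline. A minimal sketch of the new-style conversion on a generic gzipped ARPA LM (the LM and lang-directory paths are placeholders):

  gunzip -c lm.arpa.gz | \
    arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_test/words.txt \
      - data/lang_test/G.fst
  fstisstochastic data/lang_test/G.fst   # sanity check, as in the scripts below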
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_tgpr/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_tgpr/G.fst gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_tg/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_tg/G.fst # Build ConstArpaLm for the unpruned language model. @@ -65,10 +61,8 @@ gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \ --unk-symbol=$unk - data/lang${lang_suffix}_test_bd_tgconst/G.carpa || exit 1 gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_fg/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_fg/G.fst # Build ConstArpaLm for the unpruned language model. @@ -78,10 +72,8 @@ gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \ --unk-symbol=$unk - data/lang${lang_suffix}_test_bd_fgconst/G.carpa || exit 1 gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \ - arpa2fst - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \ - --osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - data/lang${lang_suffix}_test_bd_fgpr/G.fst || exit 1; fstisstochastic data/lang${lang_suffix}_test_bd_fgpr/G.fst exit 0; diff --git a/egs/wsj/s5/local/wsj_train_rnnlms.sh b/egs/wsj/s5/local/wsj_train_rnnlms.sh index 1d4fda63fe7..d1ba64c0a3c 100755 --- a/egs/wsj/s5/local/wsj_train_rnnlms.sh +++ b/egs/wsj/s5/local/wsj_train_rnnlms.sh @@ -38,36 +38,10 @@ dir=$1 srcdir=data/local/dict${dict_suffix}_larger mkdir -p $dir +$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1 export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH -( # First make sure the kaldi_lm toolkit is installed. - # Note: this didn't work out of the box for me, I had to - # change the g++ version to just "g++" (no cross-compilation - # needed for me as I ran on a machine that had been setup - # as 64 bit by default. - cd $KALDI_ROOT/tools || exit 1; - if [ -f $rnnlm_ver/rnnlm ]; then - echo Not installing the rnnlm toolkit since it is already there. - else - if [ $rnnlm_ver == "rnnlm-hs-0.1b" ]; then - extras/install_rnnlm_hs.sh - else - echo Downloading and installing the rnnlm tools - # http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz - if [ ! 
-f $rnnlm_ver.tgz ]; then - wget http://www.fit.vutbr.cz/~imikolov/rnnlm/$rnnlm_ver.tgz || exit 1; - fi - mkdir $rnnlm_ver - cd $rnnlm_ver - tar -xvzf ../$rnnlm_ver.tgz || exit 1; - make CC=g++ || exit 1; - echo Done making the rnnlm tools - fi - fi -) || exit 1; - - if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then echo "Expecting files $srcdir/cleaned.gz and $srcdir/wordlist.final to exist"; echo "You need to run local/wsj_extend_dict.sh before running this script." diff --git a/egs/wsj/s5/path.sh b/egs/wsj/s5/path.sh index c85d79a7ce3..2d17b17a84a 100755 --- a/egs/wsj/s5/path.sh +++ b/egs/wsj/s5/path.sh @@ -1,4 +1,6 @@ export KALDI_ROOT=`pwd`/../../.. -[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/nnet3bin/:$PWD:$PATH +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index 8630352cdd9..7660320dece 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -20,8 +20,8 @@ wsj1=/export/corpora5/LDC/LDC94S13B local/wsj_data_prep.sh $wsj0/??-{?,??}.? $wsj1/??-{?,??}.? || exit 1; -# Sometimes, we have seen WSJ distributions that do not have subdirectories -# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the +# Sometimes, we have seen WSJ distributions that do not have subdirectories +# like '11-13.1', but instead have 'doc', 'si_et_05', etc. directly under the # wsj0 or wsj1 directories. In such cases, try the following: # # corpus=/exports/work/inf_hcrc_cstr_general/corpora/wsj @@ -44,65 +44,23 @@ local/wsj_format_data.sh --lang-suffix "_nosp" || exit 1; # containing many of the OOVs in the WSJ LM training data, # and an LM trained directly on that data (i.e. not just # copying the arpa files from the disks from LDC). - # Caution: the commands below will only work if $decode_cmd + # Caution: the commands below will only work if $decode_cmd # is setup to use qsub. Else, just remove the --cmd option. - # NOTE: If you have a setup corresponding to the cstr_wsj_data_prep.sh style, - # use local/cstr_wsj_extend_dict.sh $corpus/wsj1/doc/ instead. - - # Note: I am commenting out the RNNLM-building commands below. They take up a lot - # of CPU time and are not really part of the "main recipe." - # Be careful: appending things like "--mem 10G" to $decode_cmd - # won't always work, it depends what $decode_cmd is. + # NOTE: If you have a setup corresponding to the older cstr_wsj_data_prep.sh style, + # use local/cstr_wsj_extend_dict.sh --dict-suffix "_nosp" $corpus/wsj1/doc/ instead. 
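Returning briefly to the path.sh change earlier in this patch: the long explicit PATH export is replaced by keeping only utils/ and the OpenFst bin directory explicit and sourcing $KALDI_ROOT/tools/config/common_path.sh for the rest (presumably the various src/*bin directories). A quick sanity check, assuming the binaries have been built:

  . ./path.sh
  which fstcompile gmm-latgen-faster nnet3-latgen-faster   # should all resolve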
( local/wsj_extend_dict.sh --dict-suffix "_nosp" $wsj1/13-32.1 && \ utils/prepare_lang.sh data/local/dict_nosp_larger \ "" data/local/lang_tmp_nosp_larger data/lang_nosp_bd && \ local/wsj_train_lms.sh --dict-suffix "_nosp" && local/wsj_format_local_lms.sh --lang-suffix "_nosp" # && - # - # ( local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 10G" data/local/rnnlm.h30.voc10k & - # sleep 20; # wait till tools compiled. - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 12G" \ - # --hidden 100 --nwords 20000 --class 350 \ - # --direct 1500 data/local/rnnlm.h100.voc20k & - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 14G" \ - # --hidden 200 --nwords 30000 --class 350 \ - # --direct 1500 data/local/rnnlm.h200.voc30k & - # local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - # --cmd "$decode_cmd --mem 16G" \ - # --hidden 300 --nwords 40000 --class 400 \ - # --direct 2000 data/local/rnnlm.h300.voc40k & - # ) - false && \ # Comment this out to train RNNLM-HS - ( - num_threads_rnnlm=8 - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 30 --nwords 10000 --direct 1000 data/local/rnnlm-hs.h30.voc10k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 100 --nwords 20000 --direct 1500 data/local/rnnlm-hs.h100.voc20k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 300 --nwords 30000 --direct 1500 data/local/rnnlm-hs.h300.voc30k - local/wsj_train_rnnlms.sh --dict-suffix "_nosp" \ - --rnnlm_ver rnnlm-hs-0.1b --threads $num_threads_rnnlm \ - --cmd "$decode_cmd --mem 1G --num-threads $num_threads_rnnlm" --bptt 4 --bptt-block 10 \ - --hidden 400 --nwords 40000 --direct 2000 data/local/rnnlm-hs.h400.voc40k - ) ) & # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. mfccdir=mfcc -for x in test_eval92 test_eval93 test_dev93 train_si284; do +for x in test_eval92 test_eval93 test_dev93 train_si284; do steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 \ data/$x exp/make_mfcc/$x $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; @@ -129,7 +87,7 @@ steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ data/test_dev93 exp/mono0a/decode_nosp_tgpr_dev93 && \ steps/decode.sh --nj 8 --cmd "$decode_cmd" exp/mono0a/graph_nosp_tgpr \ - data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 + data/test_eval92 exp/mono0a/decode_nosp_tgpr_eval92 ) & steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ @@ -143,7 +101,7 @@ while [ ! -f data/lang_nosp_test_tgpr/tmp/LG.fst ] || \ sleep 20; done sleep 30; -# or the mono mkgraph.sh might be writing +# or the mono mkgraph.sh might be writing # data/lang_test_tgpr/tmp/LG.fst which will cause this to fail. 
utils/mkgraph.sh data/lang_nosp_test_tgpr \ @@ -222,9 +180,9 @@ steps/lmrescore.sh --cmd "$decode_cmd" \ exp/tri2b/decode_nosp_tgpr_dev93_tg || exit 1; # Trying Minimum Bayes Risk decoding (like Confusion Network decoding): -mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr +mkdir exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr cp exp/tri2b/decode_nosp_tgpr_dev93_tg/lat.*.gz \ - exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr + exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr local/score_mbr.sh --cmd "$decode_cmd" \ data/test_dev93/ data/lang_nosp_test_tgpr/ \ exp/tri2b/decode_nosp_tgpr_dev93_tg_mbr @@ -291,14 +249,6 @@ steps/lmrescore.sh --cmd "$decode_cmd" \ data/test_eval92 exp/tri3b/decode_nosp_bd_tgpr_eval92 \ exp/tri3b/decode_nosp_bd_tgpr_eval92_tg || exit 1; -# The command below is commented out as we commented out the steps above -# that build the RNNLMs, so it would fail. -# local/run_rnnlms_tri3b.sh --lang-suffix "_nosp" - -# The command below is commented out as we commented out the steps above -# that build the RNNLMs (HS version), so it would fail. -# wait; local/run_rnnlm-hs_tri3b.sh --lang-suffix "_nosp" - # The following two steps, which are a kind of side-branch, try mixing up ( # from the 3b system. This is to demonstrate that script. steps/mixup.sh --cmd "$train_cmd" \ @@ -326,7 +276,7 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \ exp/tri4a/graph_nosp_tgpr data/test_eval92 \ exp/tri4a/decode_nosp_tgpr_eval92 || exit 1; -) & +) & # This step is just to demonstrate the train_quick.sh script, in which we @@ -417,6 +367,10 @@ local/online/run_nnet2.sh local/online/run_nnet2_baseline.sh local/online/run_nnet2_discriminative.sh +# Demonstration of RNNLM rescoring on TDNN models. We comment this out by +# default. +# local/run_rnnlms.sh + local/run_mmi_tri4b.sh #local/run_nnet2.sh @@ -443,7 +397,7 @@ local/nnet/run_dnn.sh #local/run_bnf_sgmm.sh -# You probably want to try KL-HMM +# You probably want to try KL-HMM #local/run_kl_hmm.sh # Getting results [see RESULTS file] @@ -474,9 +428,20 @@ local/nnet/run_dnn.sh # - exp/tri4b/decode_bd_tgpr_eval92/kws/kwslist.xml # # forward-backward decoding example [way to speed up decoding by decoding forward -# # and backward in time] +# # and backward in time] # local/run_fwdbwd.sh # # A couple of nnet3 recipes: # local/nnet3/run_tdnn_baseline.sh # designed for exact comparison with nnet2 recipe -# local/nnet3/run_tdnn.sh # better absolute results \ No newline at end of file +# local/nnet3/run_tdnn.sh # better absolute results +# local/nnet3/run_lstm.sh # lstm recipe +# bidirectional lstm recipe +# local/nnet3/run_lstm.sh --affix bidirectional \ +# --lstm-delay " [-1,1] [-2,2] [-3,3] " \ +# --label-delay 0 \ +# --cell-dim 640 \ +# --recurrent-projection-dim 128 \ +# --non-recurrent-projection-dim 128 \ +# --chunk-left-context 40 \ +# --chunk-right-context 40 + diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index 0f195c6e88f..b3a2107a086 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -63,12 +63,14 @@ splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh new file mode 100755 index 00000000000..12c2a6479e4 --- /dev/null +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Version of align_fmllr.sh that generates lattices (lat.*.gz) with +# alignments of alternative pronunciations in them. Mainly intended +# as a precursor to CTC training for now. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=10 +retry_beam=40 +final_beam=20 # For the lattice-generation phase there is no retry-beam. This + # is a limitation of gmm-latgen-faster. We just use an + # intermediate beam. We'll lose a little data and it will be + # slightly slower. (however, the min-active of 200 that + # gmm-latgen-faster defaults to may help.) +boost_silence=1.0 # factor by which to boost silence during alignment. +fmllr_update_type=full +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr_lats.sh " + echo "e.g.: steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --fmllr-update-type (full|diag|offset|none) # default full." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.alimdl $dir 2>/dev/null +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
+delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + cp $srcdir/full.mat $dir 2>/dev/null + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## because gmm-latgen-faster doesn't support adding the transition-probs to the +## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, +## because the other scripts write them without transition probs. +if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +if [ $stage -le 1 ]; then + # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # as explained above, we compiled the transition probs into the training + # graphs. + echo "$0: aligning data in $data using $alimdl and speaker-independent features." + $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-fmllr-gpost --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). 
+ # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + echo "$0: generating lattices containing alternate pronunciations." + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \ + --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \ + "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz + +echo "$0: done generating lattices from training transcripts." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/wsj/s5/steps/align_si.sh b/egs/wsj/s5/steps/align_si.sh index 2e3e9496d5d..ff53c773819 100755 --- a/egs/wsj/s5/steps/align_si.sh +++ b/egs/wsj/s5/steps/align_si.sh @@ -56,6 +56,8 @@ splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -68,7 +70,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $srcdir/full.mat $dir ;; diff --git a/egs/wsj/s5/steps/cleanup/combine_short_segments.py b/egs/wsj/s5/steps/cleanup/combine_short_segments.py new file mode 100755 index 00000000000..f51da6afa25 --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/combine_short_segments.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +# Copyright 2016 Vijayaditya Peddinti +# Apache 2.0 + +import argparse +import sys +import os +import subprocess +import errno +import copy +import shutil + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + This script concatenates segments in the input_data_dir to ensure that""" + " the segments in the output_data_dir have a specified minimum length.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--minimum-duration", type=float, required = True, + help="Minimum duration of the segments in the output directory") + parser.add_argument("--input-data-dir", type=str, required = True) + parser.add_argument("--output-data-dir", type=str, required = True) + + print(' '.join(sys.argv)) + args = parser.parse_args() + return args + +def RunKaldiCommand(command, wait = True): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a + sequence of commands connected by pipes, so we use shell=True """ + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + +def MakeDir(dir): + try: + os.mkdir(dir) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise exc + raise Exception("Directory {0} already exists".format(dir)) + pass + +def CheckFiles(input_data_dir): + for file_name in ['spk2utt', 'text', 'utt2spk', 'feats.scp']: + file_name = '{0}/{1}'.format(input_data_dir, file_name) + if not os.path.exists(file_name): + raise Exception("There is no such file {0}".format(file_name)) + +def ParseFileToDict(file, assert2fields = False, value_processor = None): + if value_processor is None: + value_processor = lambda x: x[0] + + dict = {} + for line in open(file, 'r'): + parts = line.split() + if assert2fields: + assert(len(parts) == 2) + + dict[parts[0]] = value_processor(parts[1:]) + return dict + +def WriteDictToFile(dict, file_name): + file = open(file_name, 'w') + keys = dict.keys() + keys.sort() + for key in keys: + value = dict[key] + if type(value) in [list, tuple] : + if type(value) is tuple: + value = list(value) + value.sort() + value = ' '.join(value) + file.write('{0}\t{1}\n'.format(key, value)) + file.close() + + +def ParseDataDirInfo(data_dir): + data_dir_file = lambda file_name: '{0}/{1}'.format(data_dir, file_name) + + utt2spk = ParseFileToDict(data_dir_file('utt2spk')) + spk2utt = ParseFileToDict(data_dir_file('spk2utt'), value_processor = lambda x: x) + text = ParseFileToDict(data_dir_file('text'), value_processor = lambda x: " ".join(x)) + # we want to assert feats.scp has just 2 fields, as we don't know how + # to process it otherwise + feat = ParseFileToDict(data_dir_file('feats.scp'), assert2fields = True) + utt2dur = ParseFileToDict(data_dir_file('utt2dur'), value_processor = lambda x: float(x[0])) + utt2uniq = None + if os.path.exists(data_dir_file('utt2uniq')): + utt2uniq = ParseFileToDict(data_dir_file('utt2uniq')) + return utt2spk, spk2utt, text, feat, utt2dur, utt2uniq + + +def GetCombinedUttIndexRange(utt_index, utts, utt_durs, minimum_duration): + # We want the minimum number of concatenations + # to reach the minimum_duration. If two concatenations satisfy + # the minimum duration constraint we choose the shorter one. 
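Taking the new steps/cleanup/combine_short_segments.py as a whole: it operates on a Kaldi data directory (it checks for spk2utt, text, utt2spk and feats.scp) and writes a new directory in which short segments have been concatenated until each reaches the requested minimum duration. A usage sketch with arbitrary example values (the 1.5-second threshold and directory names are illustrative only):

  steps/cleanup/combine_short_segments.py \
    --minimum-duration 1.5 \
    --input-data-dir data/train \
    --output-data-dir data/train_comb1.5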
+ left_index = utt_index - 1 + right_index = utt_index + 1 + num_remaining_segments = len(utts) - 1 + cur_utt_dur = utt_durs[utts[utt_index]] + + while num_remaining_segments > 0: + + left_utt_dur = 0 + if left_index >= 0: + left_utt_dur = utt_durs[utts[left_index]] + right_utt_dur = 0 + if right_index <= len(utts) - 1: + right_utt_dur = utt_durs[utts[right_index]] + + right_combined_utt_dur = cur_utt_dur + right_utt_dur + left_combined_utt_dur = cur_utt_dur + left_utt_dur + left_right_combined_utt_dur = cur_utt_dur + left_utt_dur + right_utt_dur + + combine_left_exit = False + combine_right_exit = False + if right_combined_utt_dur >= minimum_duration: + if left_combined_utt_dur >= minimum_duration: + if left_combined_utt_dur <= right_combined_utt_dur: + combine_left_exit = True + else: + combine_right_exit = True + else: + combine_right_exit = True + elif left_combined_utt_dur >= minimum_duration: + combine_left_exit = True + elif left_right_combined_utt_dur >= minimum_duration : + combine_left_exit = True + combine_right_exit = True + + if combine_left_exit and combine_right_exit: + cur_utt_dur = left_right_combined_utt_dur + break + elif combine_left_exit: + cur_utt_dur = left_combined_utt_dur + # move back the right_index as we don't need to combine it + right_index = right_index - 1 + break + elif combine_right_exit: + cur_utt_dur = right_combined_utt_dur + # move back the left_index as we don't need to combine it + left_index = left_index + 1 + break + + # couldn't satisfy minimum duration requirement so continue search + if left_index >= 0: + num_remaining_segments = num_remaining_segments - 1 + if right_index <= len(utts) - 1: + num_remaining_segments = num_remaining_segments - 1 + + left_index = left_index - 1 + right_index = right_index + 1 + + cur_utt_dur = left_right_combined_utt_dur + left_index = max(0, left_index) + right_index = min(len(utts)-1, right_index) + return left_index, right_index, cur_utt_dur + + +def WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq): + out_dir_file = lambda file_name: '{0}/{1}'.format(output_dir, file_name) + total_combined_utt_list = [] + for speaker in spk2utt.keys(): + utts = spk2utt[speaker] + for utt in utts: + if type(utt) is tuple: + #this is a combined utt + total_combined_utt_list.append((speaker, utt)) + + for speaker, combined_utt_tuple in total_combined_utt_list: + combined_utt_list = list(combined_utt_tuple) + combined_utt_list.sort() + new_utt_name = "-".join(combined_utt_list)+'-appended' + + # updating the utt2spk dict + for utt in combined_utt_list: + spk_name = utt2spk.pop(utt) + utt2spk[new_utt_name] = spk_name + + # updating the spk2utt dict + spk2utt[speaker].remove(combined_utt_tuple) + spk2utt[speaker].append(new_utt_name) + + # updating the text dict + combined_text = [] + for utt in combined_utt_list: + combined_text.append(text.pop(utt)) + text[new_utt_name] = ' '.join(combined_text) + + # updating the feat dict + combined_feat = [] + for utt in combined_utt_list: + combined_feat.append(feat.pop(utt)) + feat_command = "concat-feats --print-args=false {feats} - |".format(feats = " ".join(combined_feat)) + feat[new_utt_name] = feat_command + + # updating utt2dur + combined_dur = 0 + for utt in combined_utt_list: + combined_dur += utt2dur.pop(utt) + utt2dur[new_utt_name] = combined_dur + + # updating utt2uniq + if utt2uniq is not None: + combined_uniqs = [] + for utt in combined_utt_list: + combined_uniqs.append(utt2uniq.pop(utt)) + # utt2uniq file is used to map perturbed data to original 
unperturbed + # versions so that the training cross validation sets can avoid overlap + # of data however if perturbation changes the length of the utterance + # (e.g. speed perturbation) the utterance combinations in each + # perturbation of the original recording can be very different. So there + # is no good way to find the utt2uniq mappinng so that we can avoid + # overlap. + utt2uniq[new_utt_name] = combined_uniqs[0] + + + WriteDictToFile(utt2spk, out_dir_file('utt2spk')) + WriteDictToFile(spk2utt, out_dir_file('spk2utt')) + WriteDictToFile(feat, out_dir_file('feats.scp')) + WriteDictToFile(text, out_dir_file('text')) + if utt2uniq is not None: + WriteDictToFile(utt2uniq, out_dir_file('utt2uniq')) + WriteDictToFile(utt2dur, out_dir_file('utt2dur')) + + +def CombineSegments(input_dir, output_dir, minimum_duration): + utt2spk, spk2utt, text, feat, utt2dur, utt2uniq = ParseDataDirInfo(input_dir) + total_combined_utt_list = [] + + # copy the duration dictionary so that we can modify it + utt_durs = copy.deepcopy(utt2dur) + speakers = spk2utt.keys() + speakers.sort() + for speaker in speakers: + + utts = spk2utt[speaker] # this is an assignment of the reference + # In WriteCombinedDirFiles the values of spk2utt will have the list + # of combined utts which will be used as reference + + # we make an assumption that the sorted uttlist corresponds + # to contiguous segments. This is true only if utt naming + # is done according to accepted conventions + # this is an easily violatable assumption. Have to think of a better + # way to do this. + utts.sort() + utt_index = 0 + while utt_index < len(utts): + if utt_durs[utts[utt_index]] < minimum_duration: + left_index, right_index, cur_utt_dur = GetCombinedUttIndexRange(utt_index, utts, utt_durs, minimum_duration) + if not cur_utt_dur >= minimum_duration: + # this is a rare occurrence, better make the user aware of this + # situation and let them deal with it + raise Exception('Speaker {0} does not have enough utterances to satisfy the minimum duration constraint'.format(speaker)) + + combined_duration = 0 + combined_utts = [] + # update the utts_dur dictionary + for utt in utts[left_index:right_index + 1]: + combined_duration += utt_durs.pop(utt) + if type(utt) is tuple: + for item in utt: + combined_utts.append(item) + else: + combined_utts.append(utt) + combined_utts = tuple(combined_utts) # converting to immutable type to use as dictionary key + assert(cur_utt_dur == combined_duration) + + # now modify the utts list + combined_indices = range(left_index, right_index + 1) + # start popping from the largest index so that the lower + # indexes are valid + for i in combined_indices[::-1]: + utts.pop(i) + utts.insert(left_index, combined_utts) + utt_durs[combined_utts] = combined_duration + utt_index = left_index + utt_index = utt_index + 1 + WriteCombinedDirFiles(output_dir, utt2spk, spk2utt, text, feat, utt2dur, utt2uniq) + +def Main(): + args = GetArgs() + + CheckFiles(args.input_data_dir) + MakeDir(args.output_data_dir) + feat_lengths = {} + segments_file = '{0}/segments'.format(args.input_data_dir) + + RunKaldiCommand("utils/data/get_utt2dur.sh {0}".format(args.input_data_dir)) + + CombineSegments(args.input_data_dir, args.output_data_dir, args.minimum_duration) + + RunKaldiCommand("utils/utt2spk_to_spk2utt.pl {od}/utt2spk > {od}/spk2utt".format(od = args.output_data_dir)) + if os.path.exists('{0}/cmvn.scp'.format(args.input_data_dir)): + shutil.copy('{0}/cmvn.scp'.format(args.input_data_dir), args.output_data_dir) + + 
RunKaldiCommand("utils/fix_data_dir.sh {0}".format(args.output_data_dir)) +if __name__ == "__main__": + Main() + + diff --git a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl index 911640f5495..5af5fd34662 100755 --- a/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl +++ b/egs/wsj/s5/steps/cleanup/create_segments_from_ctm.pl @@ -42,11 +42,11 @@ (default = "<***>") --wer-cutoff : Ignore segments with WER higher than the specified value. -1 means no segment will be ignored. (default = -1) - --use-silence-midpoints : Set to 1 if you want to use silence midpoints + --use-silence-midpoints : Set to 1 if you want to use silence midpoints instead of min_sil_length for silence overhang.(default 0) - --force-correct-boundary-words : Set to zero if the segments will not be + --force-correct-boundary-words : Set to zero if the segments will not be required to have boundary words to be correct. Default 1 - --aligned-ctm-filename : If set, the intermediate aligned ctm + --aligned-ctm-filename : If set, the intermediate aligned ctm is saved to this file EOU @@ -56,7 +56,7 @@ my $separator = ";"; my $special_symbol = "<***>"; my $wer_cutoff = -1; -my $use_silence_midpoints = 0; +my $use_silence_midpoints = 0; my $force_correct_boundary_words = 1; my $aligned_ctm_filename = ""; GetOptions( @@ -122,13 +122,13 @@ sub PrintSegment { # Works out the surrounding silence. my $index = $seg_start_index - 1; - while ($index >= 0 && $aligned_ctm->[$index]->[0] eq + while ($index >= 0 && $aligned_ctm->[$index]->[0] eq "" && $aligned_ctm->[$index]->[3] == 0) { $index -= 1; } - my $left_of_segment_has_deletion = "false"; - $left_of_segment_has_deletion = "true" - if ($index > 0 && $aligned_ctm->[$index-1]->[0] ne "" + my $left_of_segment_has_deletion = "false"; + $left_of_segment_has_deletion = "true" + if ($index > 0 && $aligned_ctm->[$index-1]->[0] ne "" && $aligned_ctm->[$index-1]->[3] == 0); my $pad_start_sil = ($aligned_ctm->[$seg_start_index]->[1] - @@ -141,11 +141,11 @@ sub PrintSegment { my $right_of_segment_has_deletion = "false"; $index = $seg_end_index + 1; while ($index < scalar(@{$aligned_ctm}) && - $aligned_ctm->[$index]->[0] eq "" && + $aligned_ctm->[$index]->[0] eq "" && $aligned_ctm->[$index]->[3] == 0) { $index += 1; } - $right_of_segment_has_deletion = "true" + $right_of_segment_has_deletion = "true" if ($index < scalar(@{$aligned_ctm})-1 && $aligned_ctm->[$index+1]->[0] ne "" && $aligned_ctm->[$index - 1]->[3] > 0); my $pad_end_sil = ($aligned_ctm->[$index - 1]->[1] + @@ -155,7 +155,7 @@ sub PrintSegment { if (($right_of_segment_has_deletion eq "true") || !$use_silence_midpoints) { if ($pad_end_sil > $min_sil_length / 2.0) { $pad_end_sil = $min_sil_length / 2.0; - } + } } my $seg_start = $aligned_ctm->[$seg_start_index]->[1] - $pad_start_sil; @@ -228,7 +228,8 @@ sub SplitLongSegment { $aligned_ctm->[$seg_end_index]->[2] - $aligned_ctm->[$seg_start_index]->[1]; my $current_seg_index = $seg_start_index; - while ($current_seg_length > 1.5 * $max_seg_length) { + my $aligned_ctm_size = keys($aligned_ctm); + while ($current_seg_length > 1.5 * $max_seg_length && $current_seg_index < $aligned_ctm_size-1) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, $max_seg_length); my $ans = PrintSegment($aligned_ctm, $wav_id, $min_sil_length, @@ -241,6 +242,14 @@ sub SplitLongSegment { $aligned_ctm->[$current_seg_index]->[1]; } + if ($current_seg_index eq $aligned_ctm_size-1) { + my $ans = PrintSegment($aligned_ctm, 
$wav_id, $min_sil_length, + $min_seg_length, $current_seg_index, $current_seg_index, + $current_seg_count, $SO, $TO); + $current_seg_count += 1 if ($ans != -1); + return ($current_seg_count, $current_seg_index); + } + if ($current_seg_length > $max_seg_length) { my $split_point = GetSplitPoint($aligned_ctm, $current_seg_index, $seg_end_index, @@ -269,6 +278,7 @@ sub ProcessWav { $current_ctm, $current_align, $SO, $TO, $ACT) = @_; my $wav_id = $current_ctm->[0]->[0]; + my $channel_id = $current_ctm->[0]->[1]; defined($wav_id) || die "Error: empty wav section\n"; # First, we have to align the ctm file to the Levenshtein alignment. @@ -308,7 +318,7 @@ sub ProcessWav { $aligned_ctm[-1]->[3] += 1; } else { push(@aligned_ctm, ["", $start, $dur, 1]); - } + } } else { # Case 2.3: substitution. push(@aligned_ctm, [$ref_word, $start, $dur, 1]); @@ -322,10 +332,10 @@ sub ProcessWav { } # Save the aligned CTM if needed - if(tell($ACT) != -1){ - for (my $i=0; $i<=$#aligned_ctm; $i++) { - print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][1] "; - print $ACT "$aligned_ctm[$i][2] $aligned_ctm[$i][3]\n"; + if(defined($ACT)){ + for (my $i = 0; $i <= $#aligned_ctm; $i++) { + print $ACT "$wav_id $channel_id $aligned_ctm[$i][1] $aligned_ctm[$i][2] "; + print $ACT "$aligned_ctm[$i][0] $aligned_ctm[$i][3]\n"; } } @@ -345,8 +355,8 @@ sub ProcessWav { # length, and if there are no alignment error around it. We also make sure # that segment contains actual words, instead of pure silence. if ($aligned_ctm[$x]->[0] eq "" && - $aligned_ctm[$x]->[2] >= $min_sil_length - && (($force_correct_boundary_words && $lcorrect eq "true" && + $aligned_ctm[$x]->[2] >= $min_sil_length + && (($force_correct_boundary_words && $lcorrect eq "true" && $rcorrect eq "true") || !$force_correct_boundary_words)) { if ($current_seg_length <= $max_seg_length && $current_seg_length >= $min_seg_length) { @@ -378,7 +388,7 @@ sub ProcessWav { # 011 A 3.39 0.23 SELL # 011 A 3.62 0.18 OFF # 011 A 3.83 0.45 ASSETS -# +# # Output ctm: # 011 A 3.39 0.23 SELL # 011 A 3.62 0.18 OFF @@ -391,7 +401,7 @@ sub InsertSilence { my $new_start = sprintf("%.2f", $ctm_in->[$x - 1]->[2] + $ctm_in->[$x - 1]->[3]); - if ($new_start <= $ctm_in->[$x]->[2]) { + if ($new_start < $ctm_in->[$x]->[2]) { my $new_dur = sprintf("%.2f", $ctm_in->[$x]->[2] - $new_start); push(@{$ctm_out}, [$ctm_in->[$x - 1]->[0], $ctm_in->[$x - 1]->[1], $new_start, $new_dur, ""]); @@ -458,4 +468,4 @@ sub InsertSilence { close(AI); close($SO); close($TO); -close($ACT); +close($ACT) if defined($ACT); diff --git a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh index c768d89b44e..cdf1ff3e5df 100755 --- a/egs/wsj/s5/steps/cleanup/debug_lexicon.sh +++ b/egs/wsj/s5/steps/cleanup/debug_lexicon.sh @@ -4,11 +4,12 @@ # this script gets some stats that will help you debug the lexicon. -# Begin configuration section. +# Begin configuration section. stage=1 remove_stress=false nj=10 # number of jobs for various decoding-type things that we run. cmd=run.pl +alidir= # End configuration section echo "$0 $@" # Print the command line for logging @@ -26,6 +27,8 @@ if [ $# != 5 ]; then echo " --remove-stress # if true, remove stress before printing analysis" echo " # note: if you change this, you only have to rerun" echo " # from stage 10." + echo " --alidir # if supplied, training-data alignments and transforms" + echo " # are obtained from here instead of being generated." exit 1; fi @@ -41,38 +44,46 @@ for f in $data/feats.scp $lang/phones.txt $src/final.mdl $srcdict; do [ ! 
-f $f ] && echo "$0: expected file $f to exist" && exit 1; done -if [ $stage -le 1 ]; then - steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src ${src}_ali_$(basename $data) +if [ -z $alidir ]; then + alidir=${src}_ali_$(basename $data) + if [ $stage -le 1 ]; then + steps/align_fmllr.sh --cmd "$cmd" --nj $nj $data $lang $src $alidir + fi fi +phone_lang=data/$(basename $lang)_phone_bg + if [ $stage -le 2 ]; then - utils/make_phone_bigram_lang.sh $lang ${src}_ali_$(basename $data) data/$(basename $lang)_phone_bg + utils/make_phone_bigram_lang.sh $lang $alidir $phone_lang fi if [ $stage -le 3 ]; then - utils/mkgraph.sh data/$(basename $lang)_phone_bg $src $src/graph_phone_bg + utils/mkgraph.sh $phone_lang $src $src/graph_phone_bg fi if [ $stage -le 4 ]; then - steps/decode_si.sh --cmd "$cmd" --nj $nj --transform-dir ${src}_ali_$(basename $data) \ - --acwt 0.25 --beam 25.0 --lattice-beam 5.0 --max-active 2500 \ + steps/decode_si.sh --skip-scoring true \ + --cmd "$cmd" --nj $nj --transform-dir $alidir \ + --acwt 0.25 --beam 10.0 --lattice-beam 5.0 --max-active 2500 \ $src/graph_phone_bg $data $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 5 ]; then - steps/get_train_ctm.sh $data $lang ${src}_ali_$(basename $data) + steps/get_train_ctm.sh --print-silence true --use-segments false \ + --cmd "$cmd" $data $lang $alidir fi if [ $stage -le 6 ]; then - steps/get_ctm.sh --min-lmwt 3 --max-lmwt 8 \ - $data data/$(basename $lang)_phone_bg $src/decode_$(basename $data)_phone_bg + steps/get_ctm.sh --use-segments false --cmd "$cmd" --min-lmwt 3 --max-lmwt 8 \ + $data $phone_lang $src/decode_$(basename $data)_phone_bg fi if [ $stage -le 7 ]; then mkdir -p $dir # lmwt=4 corresponds to the scale we decoded at. cp $src/decode_$(basename $data)_phone_bg/score_4/$(basename $data).ctm $dir/phone.ctm - cp ${src}_ali_$(basename $data)/ctm $dir/word.ctm + + cp $alidir/ctm $dir/word.ctm fi if [ $stage -le 8 ]; then @@ -82,7 +93,7 @@ if [ $stage -le 8 ]; then # we'll convert it into two entries like this, with the start and end separately: # sw02054-A 0021332 START and # sw02054-A 0021356 END and -# +# # and suppose phone.ctm has lines like # sw02054 A 213.09 0.24 sil # sw02054 A 213.33 0.13 ae_B @@ -95,18 +106,17 @@ if [ $stage -le 8 ]; then # then after sorting and merge-sorting the two ctm files we can easily # work out for each word, what the phones were during that time. 
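The commands below implement this merge. To make the merged, sorted stream concrete, here is a made-up fragment for one word (times invented for illustration): the PHONE entries falling between a word's START and END markers give its phones.

  sw02054-A 0021332 START and
  sw02054-A 0021339 PHONE ae
  sw02054-A 0021346 PHONE n
  sw02054-A 0021351 PHONE d
  sw02054-A 0021356 END and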
- grep -v '' data/$(basename $lang)_phone_bg/phones.txt | awk '{print $1, $1}' | \ + grep -v '' $phone_lang/phones.txt | awk '{print $1, $1}' | \ sed 's/_B$//' | sed 's/_I$//' | sed 's/_E$//' | sed 's/_S$//' >$dir/phone_map.txt - silphone=$(cat data/$(basename $lang)_phone_bg/phones/optional_silence.txt) - cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt | grep -v "$silphone\$" > $dir/phone_cleaned.ctm + cat $dir/phone.ctm | utils/apply_map.pl -f 5 $dir/phone_map.txt > $dir/phone_text.ctm > $dir/phone_mapped.ctm export LC_ALL=C - + cat $dir/word.ctm | awk '{printf("%s-%s %09d START %s\n", $1, $2, 100*$3, $5); printf("%s-%s %09d END %s\n", $1, $2, 100*($3+$4), $5);}' | \ sort >$dir/word_processed.ctm - cat $dir/phone_cleaned.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ + cat $dir/phone_mapped.ctm | awk '{printf("%s-%s %09d PHONE %s\n", $1, $2, 100*($3+(0.5*$4)), $5);}' | \ sort >$dir/phone_processed.ctm # merge-sort both ctm's @@ -129,12 +139,16 @@ if [ $stage -le 10 ]; then else cp $srcdict $dir/lexicon.txt fi + silphone=$(cat $phone_lang/phones/optional_silence.txt) + echo " $silphone" >> $dir/lexicon.txt awk '{count[$2] += $1;} END {for (w in count){print w, count[w];}}' \ <$dir/prons.txt >$dir/counts.txt + + cat $dir/prons.txt | \ - if $remove_stress; then + if $remove_stress; then perl -e 'while(<>) { @A=split(" ", $_); for ($n=1;$n<@A;$n++) { $A[$n] =~ s/[0-9]$//; } print join(" ", @A) . "\n"; } ' else cat @@ -143,9 +157,9 @@ if [ $stage -le 10 ]; then open(D, "<$ARGV[0]") || die "opening dict file $ARGV[0]"; # create a hash of all reference pronuncations, and for each word, record # a list of the prons, separated by " | ". - while () { - @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; - $w = shift @A; + while () { + @A = split(" ", $_); $is_pron{join(" ",@A)} = 1; + $w = shift @A; if (!defined $prons{$w}) { $prons{$w} = join(" ", @A); } else { $prons{$w} = $prons{$w} . " | " . join(" ", @A); } } diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh index 97fb62a9c4f..80a71b0edc5 100755 --- a/egs/wsj/s5/steps/cleanup/find_bad_utts.sh +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts.sh @@ -5,9 +5,9 @@ # Computes training alignments using a model with delta or # LDA+MLLT features. This version, rather than just using the # text to align, computes mini-language models (unigram) from the text -# and a few common words in the LM, and allows +# and a few common words in the LM. -# Begin configuration section. +# Begin configuration section. nj=4 cmd=run.pl use_graphs=false @@ -82,7 +82,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" - cp $srcdir/final.mat $srcdir/full.mat $dir + cp $srcdir/final.mat $srcdir/full.mat $dir ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -155,7 +155,7 @@ if [ $stage -le 2 ]; then # # with the fields separated by tabs, e.g. 
# adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED - + paste $dir/edits.txt \ <(awk '{print $2}' $dir/length.txt) \ <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ @@ -171,9 +171,9 @@ fi if [ $stage -le 3 ]; then ### - # These stats migh help people figure out what is wrong with the data + # These stats might help people figure out what is wrong with the data # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt - # b)evaluation of per-speaker performance to possibly find speakers with + # b)evaluation of per-speaker performance to possibly find speakers with # distinctive accents/speech disorders and similar # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure # out if there is systematic issue with lexicon, pronunciation or phonetic confusability diff --git a/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh new file mode 100755 index 00000000000..19beaca8914 --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/find_bad_utts_nnet.sh @@ -0,0 +1,185 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 +# +# Tweaked version of find_bad_utts.sh to work with nnet2 and nnet3(supports chain models) non-ivector models. +# This script uses nnet-info and nnet3-am-info to determine type of nnet (nnet2 or nnet3). +# Use --acoustic-scale=1.0 for chain models. +# +# Begin configuration section. +nj=8 +cmd=run.pl +use_graphs=false +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=15.0 +lattice_beam=8.0 +max_active=750 +transform_dir= # directory to find fMLLR transforms in. +top_n_words=100 # Number of common words that we compile into each graph (most frequent + # in $lang/text. +stage=-1 +cleanup=true +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: $0

" + echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_debug" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --use-graphs true # use graphs in src-dir" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +for f in $data/text $lang/oov.int $srcdir/tree $srcdir/final.mdl \ + $lang/L_disambig.fst $lang/phones/disambig.int; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; + +#checking type of nnet +if nnet-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then + nnet_type="nnet"; + latgen_cmd="nnet-latgen-faster"; +elif nnet3-am-info 1>/dev/null 2>/dev/null $srcdir/final.mdl; then + nnet_type="nnet3" + frame_subsampling_factor=1; + nnet3_opt= + if [ -f $srcdir/frame_subsampling_factor ]; then + frame_subsampling_factor="$(cat $srcdir/frame_subsampling_factor)" + fi + if [ "$frame_subsamping_factor" != "1" ]; then + nnet3_opt="--frame-subsampling-factor=$frame_subsampling_factor"; + fi + latgen_cmd="nnet3-latgen-faster $nnet3_opt"; +else + echo "Unsupported type of nnet for $srcdir/final.mdl"; +fi + +echo "nnet type is $nnet_type"; + + +if [ $stage -le 0 ]; then + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt <$data/text | \ + awk '{for(x=2;x<=NF;x++) print $x;}' | sort | uniq -c | \ + sort -rn > $dir/word_counts.int || exit 1; + num_words=$(awk '{x+=$1} END{print x}' < $dir/word_counts.int) || exit 1; + # print top-n words with their unigram probabilities. 
+ + head -n $top_n_words $dir/word_counts.int | awk -v tot=$num_words '{print $1/tot, $2;}' >$dir/top_words.int + utils/int2sym.pl -f 2 $lang/words.txt <$dir/top_words.int >$dir/top_words.txt +fi + +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"; + +if [ $stage -le 1 ]; then + echo "$0: decoding $data using utterance-specific decoding graphs using model from $srcdir, output in $dir" + + rm $dir/edits.*.txt $dir/aligned_ref.*.txt 2>/dev/null + + $cmd JOB=1:$nj $dir/log/decode.JOB.log \ + utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text \| \ + steps/cleanup/make_utterance_fsts.pl $dir/top_words.int \| \ + compile-train-graphs-fsts $scale_opts --read-disambig-syms=$lang/phones/disambig.int \ + $dir/tree $dir/final.mdl $lang/L_disambig.fst ark:- ark:- \| \ + $latgen_cmd --acoustic-scale=$acoustic_scale --beam=$beam \ + --max-active=$max_active --lattice-beam=$lattice_beam \ + --word-symbol-table=$lang/words.txt \ + $dir/final.mdl ark:- "$feats" ark:- \| \ + lattice-oracle ark:- "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \ + ark,t:- ark,t:$dir/edits.JOB.txt \| \ + utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/aligned_ref.JOB.txt || exit 1; +fi + + +if [ $stage -le 2 ]; then + if [ -f $dir/edits.1.txt ]; then + # the awk commands below are to ensure that partially-written files don't confuse us. + for x in $(seq $nj); do cat $dir/edits.$x.txt; done | awk '{if(NF==2){print;}}' > $dir/edits.txt + for x in $(seq $nj); do cat $dir/aligned_ref.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/aligned_ref.txt + else + echo "$0: warning: no file $dir/edits.1.txt, using previously concatenated file if present." + fi + + # in case any utterances failed to align, get filtered copy of $data/text + utils/filter_scp.pl $dir/edits.txt < $data/text > $dir/text + cat $dir/text | awk '{print $1, (NF-1);}' > $dir/length.txt + + n1=$(wc -l < $dir/edits.txt) + n2=$(wc -l < $dir/aligned_ref.txt) + n3=$(wc -l < $dir/text) + n4=$(wc -l < $dir/length.txt) + if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then + echo "$0: mismatch in lengths of files:" + wc $dir/edits.txt $dir/aligned_ref.txt $dir/text $dir/length.txt + exit 1; + fi + + # note: the format of all_info.txt is: + # + # with the fields separated by tabs, e.g. 
+ # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED + + paste $dir/edits.txt \ + <(awk '{print $2}' $dir/length.txt) \ + <(awk '{$1="";print;}' <$dir/aligned_ref.txt) \ + <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt + + sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt + + if $cleanup; then + rm $dir/edits.*.txt $dir/aligned_ref.*.txt + fi + +fi + +if [ $stage -le 3 ]; then + ### + # These stats migh help people figure out what is wrong with the data + # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt + # b)evaluation of per-speaker performance to possibly find speakers with + # distinctive accents/speech disorders and similar + # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure + # out if there is systematic issue with lexicon, pronunciation or phonetic confusability + + mkdir -p $dir/analysis + align-text --special-symbol="***" ark:$dir/text ark:$dir/aligned_ref.txt ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "***" > $dir/analysis/per_utt_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk > $dir/analysis/per_spk_details.txt + + cat $dir/analysis/per_utt_details.txt | \ + utils/scoring/wer_ops_details.pl --special-symbol "***" | \ + sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt + +fi + diff --git a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh index 733eba34d10..a3b1e2af70a 100755 --- a/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh +++ b/egs/wsj/s5/steps/cleanup/make_utterance_graph.sh @@ -111,10 +111,8 @@ while read line; do if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } } printf("\n"); }' > $wdir/text ngram-count -text $wdir/text -order $ngram_order "$srilm_options" -lm - |\ - arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\ - fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \ - --keep_isymbols=false --keep_osymbols=false |\ - fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1; + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$lang/words.txt - $wdir/G.fst || exit 1; fi fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic." @@ -134,7 +132,7 @@ while read line; do make-h-transducer --disambig-syms-out=$wdir/disambig_tid.int \ --transition-scale=$tscale $wdir/ilabels_${N}_${P} \ - $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst + $model_dir/tree $model_dir/final.mdl > $wdir/Ha.fst # Builds HCLGa.fst fsttablecompose $wdir/Ha.fst $wdir/CLG.fst | \ @@ -143,10 +141,10 @@ while read line; do fstminimizeencoded > $wdir/HCLGa.fst fstisstochastic $wdir/HCLGa.fst ||\ echo "$0: $uttid/HCLGa.fst is not stochastic" - + add-self-loops --self-loop-scale=$loopscale --reorder=true \ $model_dir/final.mdl < $wdir/HCLGa.fst > $wdir/HCLG.fst - + if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then fstisstochastic $wdir/HCLG.fst ||\ echo "$0: $uttid/HCLG.fst is not stochastic." 
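A quick way to act on the all_info.sorted.txt written by find_bad_utts_nnet.sh above is to turn the raw edit counts into per-utterance error rates; a small awk sketch (assuming the exp/tri1_debug output directory from the usage example) is:

  awk -F'\t' '{ split($1, a, " "); len = ($2 > 0 ? $2 : 1); printf("%.2f\t%d\t%s\n", a[2]/len, a[2], a[1]); }' \
    exp/tri1_debug/all_info.txt | sort -rn | head -n 20

Field 1 of all_info.txt holds "utt-id #edits" (pasted from edits.txt) and field 2 the reference length in words, so this lists the 20 utterances with the highest fraction of misrecognized words, usually a better shortlist for manual inspection than the absolute edit counts ranked by the sort -nr -k2 above.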
diff --git a/egs/wsj/s5/steps/conf/append_eval_to_ctm.py b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py new file mode 100755 index 00000000000..3a35f5a9281 --- /dev/null +++ b/egs/wsj/s5/steps/conf/append_eval_to_ctm.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys,operator + +# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': +# (i.e. the output of 'align-text' post-processed by 'wer_per_utt_details.pl') + +# The tags in the appended column are: +# 'C' = correct +# 'S' = substitution +# 'I' = insertion +# 'U' = unknown (not part of scored segment) + +if len(sys.argv) != 4: + print 'Usage: %s eval-in ctm-in ctm-eval-out' % __file__ + sys.exit(1) +dummy, eval_in, ctm_in, ctm_eval_out = sys.argv + +if ctm_eval_out == '-': ctm_eval_out = '/dev/stdout' + +# Read the evaluation, +eval_vec = dict() +with open(eval_in, 'r') as f: + while True: + # Reading 4 lines encoding one utterance, + ref = f.readline() + hyp = f.readline() + op = f.readline() + csid = f.readline() + if not ref: break + # Parse the input, + utt,tag,hyp_vec = hyp.split(' ',2) + assert(tag == 'hyp') + utt,tag,op_vec = op.split(' ',2) + assert(tag == 'op') + hyp_vec = hyp_vec.split() + op_vec = op_vec.split() + # Fill the eval vector with symbols 'C', 'S', 'I', + assert(utt not in eval_vec) + eval_vec[utt] = [] + for op,hyp in zip(op_vec, hyp_vec): + if hyp != '<eps>': eval_vec[utt].append(op) + +# Load the 'ctm' into dictionary, +ctm = dict() +with open(ctm_in) as f: + for l in f: + utt, ch, beg, dur, wrd, conf = l.split() + if not utt in ctm: ctm[utt] = [] + ctm[utt].append((utt, ch, float(beg), float(dur), wrd, float(conf))) + +# Build the 'ctm' with 'eval' column added, +ctm_eval = [] +for utt,ctm_part in ctm.iteritems(): + ctm_part.sort(key = operator.itemgetter(2)) # Sort by 'beg' time, + # extending the 'tuple' by '+': + merged = [ tup + (evl,) for tup,evl in zip(ctm_part,eval_vec[utt]) ] + ctm_eval.extend(merged) + +# Sort again, +ctm_eval.sort(key = operator.itemgetter(0,1,2)) + +# Store, +with open(ctm_eval_out,'w') as f: + for tup in ctm_eval: + f.write('%s %s %f %f %s %f %s\n' % tup) + diff --git a/egs/wsj/s5/steps/conf/append_prf_to_ctm.py b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py new file mode 100755 index 00000000000..547b6176c9f --- /dev/null +++ b/egs/wsj/s5/steps/conf/append_prf_to_ctm.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys + +# Append Levenshtein alignment of 'hypothesis' and 'reference' into 'CTM': +# (parsed from the 'prf' output of 'sclite') + +# The tags in appended column are: +# 'C' = correct +# 'S' = substitution +# 'I' = insertion +# 'U' = unknown (not part of scored segment) + +# Parse options, +if len(sys.argv) != 4: + print "Usage: %s prf ctm_in ctm_out" % __file__ + sys.exit(1) +prf_file, ctm_file, ctm_out_file = sys.argv[1:] + +if ctm_out_file == '-': ctm_out_file = '/dev/stdout' + +# Load the prf file, +prf = [] +with open(prf_file) as f: + for l in f: + # Store the data, + if l[:5] == 'File:': + file_id = l.split()[1] + if l[:8] == 'Channel:': + chan = l.split()[1] + if l[:5] == 'H_T1:': + h_t1 = l + if l[:5] == 'Eval:': + evl = l + prf.append((file_id,chan,h_t1,evl)) + +# Parse the prf records into dictionary, +prf_dict = dict() +for (f,c,t,e) in prf: + t_pos = 0 # position in the 't' string, + while t_pos < len(t): + t1 = t[t_pos:].split(' ',1)[0] # get 1st token at
't_pos' + try: + # get word evaluation letter 'C,S,I', + evl = e[t_pos] if e[t_pos] != ' ' else 'C' + # add to dictionary, + key='%s,%s' % (f,c) # file,channel + if key not in prf_dict: prf_dict[key] = dict() + prf_dict[key][float(t1)] = evl + except ValueError: + pass + t_pos += len(t1)+1 # advance position for parsing, + +# Load the ctm file (with confidences), +with open(ctm_file) as f: + ctm = [ l.split() for l in f ] + +# Append the sclite alignment tags to ctm, +ctm_out = [] +for f, chan, beg, dur, wrd, conf in ctm: + # U = unknown, C = correct, S = substitution, I = insertion, + sclite_tag = 'U' + try: + sclite_tag = prf_dict[('%s,%s'%(f,chan)).lower()][float(beg)] + except KeyError: + pass + ctm_out.append([f,chan,beg,dur,wrd,conf,sclite_tag]) + +# Save the augmented ctm file, +with open(ctm_out_file, 'w') as f: + f.writelines([' '.join(ctm_record)+'\n' for ctm_record in ctm_out]) + diff --git a/egs/wsj/s5/steps/conf/apply_calibration.sh b/egs/wsj/s5/steps/conf/apply_calibration.sh new file mode 100755 index 00000000000..c1a22e274b8 --- /dev/null +++ b/egs/wsj/s5/steps/conf/apply_calibration.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0. + +# Trains logistic regression, which calibrates the per-word confidences, +# which are extracted by the Minimum Bayes Risk decoding. + +# begin configuration section. +cmd= +stage=0 +# end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 [opts] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +set -euo pipefail + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +latdir=$3 +caldir=$4 +dir=$5 + +model=$latdir/../final.mdl # assume model one level up from decoding dir. +calibration=$caldir/calibration.mdl +word_feats=$caldir/word_feats +word_categories=$caldir/word_categories + +for f in $lang/words.txt $word_feats $word_categories $latdir/lat.1.gz $calibration $model; do + [ ! 
-f $f ] && echo "$0: Missing file $f" && exit 1 +done +[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1 + +[ -d $dir/log ] || mkdir -p $dir/log +nj=$(cat $latdir/num_jobs) +lmwt=$(cat $caldir/lmwt) +decode_mbr=$(cat $caldir/decode_mbr) + +# Store the setup, +echo $lmwt >$dir/lmwt +echo $decode_mbr >$dir/decode_mbr +cp $calibration $dir/calibration.mdl +cp $word_feats $dir/word_feats +cp $word_categories $dir/word_categories + +# Create the ctm with raw confidences, +# - we keep the timing relative to the utterance, +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ + lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \ + '>' $dir/JOB.ctm + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm + rm $dir/*.ctm + cat $dir/ctm | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_int +fi + +# Compute lattice-depth, +latdepth=$dir/lattice_frame_depth.ark +if [ $stage -le 1 ]; then + [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir +fi + +# Create the forwarding data for logistic regression, +if [ $stage -le 2 ]; then + steps/conf/prepare_calibration_data.py --conf-feats $dir/forward_feats.ark \ + --lattice-depth $latdepth $dir/ctm_int $word_feats $word_categories +fi + +# Apply calibration model to dev, +if [ $stage -le 3 ]; then + logistic-regression-eval --apply-log=false $calibration \ + ark:$dir/forward_feats.ark ark,t:- | \ + awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \ + utils/int2sym.pl -f 5 $lang/words.txt \ + >$dir/ctm_calibrated +fi + +exit 0 diff --git a/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py new file mode 100755 index 00000000000..276d14b88f8 --- /dev/null +++ b/egs/wsj/s5/steps/conf/convert_ctm_to_tra.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, operator + +# This scripts loads a 'ctm' file and converts it into the 'tra' format: +# "utt-key word1 word2 word3 ... wordN" +# The 'utt-key' is the 1st column in the CTM. + +# Typically the CTM contains: +# - utterance-relative timimng (i.e. 
prepared without 'utils/convert_ctm.pl') +# - confidences + +if len(sys.argv) != 3: + print 'Usage: %s ctm-in tra-out' % __file__ + sys.exit(1) +dummy, ctm_in, tra_out = sys.argv + +if ctm_in == '-': ctm_in = '/dev/stdin' +if tra_out == '-': tra_out = '/dev/stdout' + +# Load the 'ctm' into dictionary, +tra = dict() +with open(ctm_in) as f: + for l in f: + utt, ch, beg, dur, wrd, conf = l.split() + if not utt in tra: tra[utt] = [] + tra[utt].append((float(beg),wrd)) + +# Store it in the 'tra' format, +with open(tra_out,'w') as f: + for utt,tuples in tra.iteritems(): + tuples.sort(key = operator.itemgetter(0)) # Sort by 'beg' time, + f.write('%s %s\n' % (utt,' '.join([t[1] for t in tuples]))) + diff --git a/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh b/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh new file mode 100755 index 00000000000..7167bd970bb --- /dev/null +++ b/egs/wsj/s5/steps/conf/lattice_depth_per_frame.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2015 Brno University of Technology (Author: Karel Vesely) +# Licensed under the Apache License, Version 2.0 (the "License") + +# Extract lattice-depth for each frame. + +# Begin configuration +cmd=run.pl +# End configuration + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "usage: $0 [opts] <decode-dir> <dir>" + echo "main options (for others, see top of script file)" + echo " --config <config-file> # config containing options" + echo " --cmd" + exit 1; +fi + +set -euo pipefail + +latdir=$1 +dir=$2 + +[ ! -f $latdir/lat.1.gz ] && echo "Missing $latdir/lat.1.gz" && exit 1 +nj=$(cat $latdir/num_jobs) + +# Get the per-frame lattice-depths, +$cmd JOB=1:$nj $dir/log/lattice_depth_per_frame.JOB.log \ + lattice-depth-per-frame "ark:gunzip -c $latdir/lat.JOB.gz |" ark,t:$dir/lattice_frame_depth.JOB.ark +# Merge, +for ((n=1; n<=nj; n++)); do cat $dir/lattice_frame_depth.${n}.ark; done >$dir/lattice_frame_depth.ark +rm $dir/lattice_frame_depth.*.ark + +# Done!
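The lattice_frame_depth.ark produced here is a plain-text ark with one line per utterance, "utt-id d1 d2 d3 ...", one integer lattice depth per 10 ms frame; prepare_calibration_data.py below averages these depths over each word's time span (frame index is roughly 100 times the ctm time in seconds) and feeds the log of that average to the calibration model. A one-line sanity check of the depths (the decode directory name below is only an example) is:

  awk '{ n = NF - 1; s = 0; m = 0; for (i = 2; i <= NF; i++) { s += $i; if ($i > m) m = $i; } if (n > 0) printf("%s frames=%d mean-depth=%.2f max-depth=%d\n", $1, n, s/n, m); }' \
    exp/tri4/decode_dev/lattice_frame_depth.ark | head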
diff --git a/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py new file mode 100755 index 00000000000..1be32d4c4d7 --- /dev/null +++ b/egs/wsj/s5/steps/conf/parse_arpa_unigrams.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, gzip, re + +# Parse options, +if len(sys.argv) != 4: + print "Usage: %s " % __file__ + sys.exit(0) +words_txt, arpa_gz, unigrams_out = sys.argv[1:] + +if arpa_gz == '-': arpa_gz = '/dev/stdin' +if unigrams_out == '-': unigrams_out = '/dev/stdout' + +# Load the words.txt, +words = [ l.split() for l in open(words_txt) ] + +# Load the unigram probabilities in 10log from ARPA, +wrd_log10 = dict() +with gzip.open(arpa_gz,'r') as f: + read = False + for l in f: + if l.strip() == '\\1-grams:': read = True + if l.strip() == '\\2-grams:': break + if read and len(l.split())>=2: + log10_p_unigram, wrd = re.split('[\t ]+',l.strip(),2)[:2] + wrd_log10[wrd] = float(log10_p_unigram) + +# Create list, 'wrd id log_p_unigram', +words_unigram = [[wrd, id, (wrd_log10[wrd] if wrd in wrd_log10 else -99)] for wrd,id in words ] + +print >>sys.stderr, words_unigram[0] +# Store, +with open(unigrams_out,'w') as f: + f.writelines(['%s %s %g\n' % (w,i,p) for (w,i,p) in words_unigram]) + diff --git a/egs/wsj/s5/steps/conf/prepare_calibration_data.py b/egs/wsj/s5/steps/conf/prepare_calibration_data.py new file mode 100755 index 00000000000..bc8f92a2f7f --- /dev/null +++ b/egs/wsj/s5/steps/conf/prepare_calibration_data.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys, math + +from optparse import OptionParser +desc = """ +Prepare input features and training targets for logistic regression, +which calibrates the Minimum Bayes Risk posterior confidences. + +The logisitc-regression input features are: +- posteriors from 'ctm' transformed by logit, +- logarithm of word-length in letters, +- 10base logarithm of unigram probability of a word from language model, +- logarithm of average lattice-depth at position of the word (optional), + +The logistic-regresion targets are: +- 1 for correct word, +- 0 for incorrect word (substitution, insertion), + +The iput 'ctm' is augmented by per-word tags (or 'U' is added if no tags), +'C' = correct +'S' = substitution +'I' = insertion +'U' = unknown (not part of scored segment) + +The script can be used both to prepare the training data, +or to prepare input features for forwarding through trained model. +""" +usage = "%prog [opts] ctm word-filter word-length unigrams depth-per-frame-ascii.ark word-categories" +parser = OptionParser(usage=usage, description=desc) +parser.add_option("--conf-targets", help="Targets file for logistic regression (no targets generated if '') [default %default]", default='') +parser.add_option("--conf-feats", help="Feature file for logistic regression. [default %default]", default='') +parser.add_option("--lattice-depth", help="Per-frame lattice depths, ascii-ark (optional). 
[default %default]", default='') +(o, args) = parser.parse_args() + +if len(args) != 3: + parser.print_help() + sys.exit(1) +ctm_file, word_feats_file, word_categories_file = args + +assert(o.conf_feats != '') + +# Load the ctm (optionally add eval colmn with 'U'): +ctm = [ l.split() for l in open(ctm_file) ] +if len(ctm[0]) == 6: [ l.append('U') for l in ctm ] +assert(len(ctm[0]) == 7) + +# Load the word-features, the format: "wrd wrd_id filter length other_feats" +# (typically 'other_feats' are unigram log-probabilities), +word_feats = [ l.split(None,4) for l in open(word_feats_file) ] + +# Prepare filtering dict, +word_filter = { wrd_id:bool(int(filter)) for (wrd,wrd_id,filter,length,other_feats) in word_feats } +# Prepare the lenght dict, +word_length = { wrd_id:float(length) for (wrd,wrd_id,filter,length,other_feats) in word_feats } +# Prepare other_feats dict, +other_feats = { wrd_id:other_feats.strip() for (wrd,wrd_id,filter,length,other_feats) in word_feats } + +# Build the targets, +if o.conf_targets != '': + with open(o.conf_targets,'w') as f: + for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: + # Skip the words we don't know if being correct, + if score_tag == 'U': continue + # Some words are excluded from training (partial words, hesitations, etc.), + # (Value: 1 == keep word, 0 == exclude word from the targets), + if not word_filter[wrd_id]: continue + # Build the key, + key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) + # Build the target, + tgt = 1 if score_tag == 'C' else 0 # Correct = 1, else 0, + # Write, + f.write('%s %d\n' % (key,tgt)) + +# Load the per-frame lattice-depth, +# - we assume, the 1st column in 'ctm' is the 'utterance-key' in depth file, +# - if the 'ctm' and 'ark' keys don't match, we leave this feature out, +if o.lattice_depth: + depths = dict() + for l in open(o.lattice_depth): + utt,d = l.split(' ',1) + depths[utt] = map(int,d.split()) + +# Load the 'word_categories' mapping for categorical input features derived from 'lang/words.txt', +wrd_to_cat = [ l.split() for l in open(word_categories_file) ] +wrd_to_cat = { wrd_id:int(category) for wrd,wrd_id,category in wrd_to_cat } +wrd_cat_num = max(wrd_to_cat.values()) + 1 + +# Build the input features, +with open(o.conf_feats,'w') as f: + for (utt, chan, beg, dur, wrd_id, conf, score_tag) in ctm: + # Build the key, same as previously, + key = "%s^%s^%s^%s^%s,%s,%s" % (utt, chan, beg, dur, wrd_id, conf, score_tag) + + # Build input features, + # - logit of MBR posterior, + damper = 0.001 # avoid -inf,+inf from log, + logit = math.log(float(conf)+damper) - math.log(1.0 - float(conf)+damper) + # - log of word-length, + log_word_length = math.log(word_length[wrd_id]) # i.e. 
number of phones in a word, + # - categorical distribution of words (with frequency higher than min-count), + wrd_1_of_k = [0]*wrd_cat_num; + wrd_1_of_k[wrd_to_cat[wrd_id]] = 1; + + # Compose the input feature vector, + feats = [ logit, log_word_length, other_feats[wrd_id] ] + wrd_1_of_k + + # Optionally add average-depth of lattice at the word position, + if o.lattice_depth != '': + depth_slice = depths[utt][int(round(100.0*float(beg))):int(round(100.0*(float(beg)+float(dur))))] + log_avg_depth = math.log(float(sum(depth_slice))/len(depth_slice)) + feats += [ log_avg_depth ] + + # Store the input features, + f.write(key + ' [ ' + ' '.join(map(str,feats)) + ' ]\n') + diff --git a/egs/wsj/s5/steps/conf/prepare_word_categories.py b/egs/wsj/s5/steps/conf/prepare_word_categories.py new file mode 100755 index 00000000000..3b758001c5a --- /dev/null +++ b/egs/wsj/s5/steps/conf/prepare_word_categories.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python + +# Copyright 2015 Brno University of Technology (author: Karel Vesely) +# Apache 2.0 + +import sys + +from optparse import OptionParser +desc = """ +Prepare mapping of words into categories. Each word with minimal frequency +has its own category, the rest is merged into single class. +""" +usage = "%prog [opts] words.txt ctm category_mapping" +parser = OptionParser(usage=usage, description=desc) +parser.add_option("--min-count", help="Minimum word-count to have a single word category. [default %default]", type='int', default=20) +(o, args) = parser.parse_args() + +if len(args) != 3: + parser.print_help() + sys.exit(1) +words_file, text_file, category_mapping_file = args + +if text_file == '-': text_file = '/dev/stdin' +if category_mapping_file == '-': category_mapping_file = '/dev/stdout' + +# Read the words from the 'tra' file, +with open(text_file) as f: + text_words = [ l.split()[1:] for l in f ] + +# Flatten the array of arrays of words, +import itertools +text_words = list(itertools.chain.from_iterable(text_words)) + +# Count the words (regardless if correct or incorrect), +word_counts = dict() +for w in text_words: + if w not in word_counts: word_counts[w] = 0 + word_counts[w] += 1 + +# Read the words.txt, +with open(words_file) as f: + word_id = [ l.split() for l in f ] + +# Append the categories, +n=1 +word_id_cat=[] +for word, idx in word_id: + cat = 0 + if word in word_counts: + if word_counts[word] > o.min_count: + cat = n; n += 1 + word_id_cat.append([word, idx, str(cat)]) + +# Store the mapping, +with open(category_mapping_file,'w') as f: + f.writelines([' '.join(record)+'\n' for record in word_id_cat]) diff --git a/egs/wsj/s5/steps/conf/train_calibration.sh b/egs/wsj/s5/steps/conf/train_calibration.sh new file mode 100755 index 00000000000..c2aca05056e --- /dev/null +++ b/egs/wsj/s5/steps/conf/train_calibration.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Copyright 2015, Brno University of Technology (Author: Karel Vesely). Apache 2.0. + +# Trains logistic regression, which calibrates the per-word confidences in 'CTM'. +# The 'raw' confidences are obtained by Minimum Bayes Risk decoding. + +# The input features of logistic regression are: +# - logit of Minumum Bayer Risk posterior +# - log of word-length in characters +# - log of average-depth depth of a lattice at words' position +# - log of frames per character ratio +# (- categorical distribution of 'lang/words.txt', DISABLED) + +# begin configuration section. 
+cmd= +lmwt=12 +decode_mbr=true +word_min_count=10 # Minimum word-count for single-word category, +normalizer=0.0025 # L2 regularization constant, +category_text= # Alternative corpus for counting words to get word-categories (by default using 'ctm'), +stage=0 +# end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 5 ]; then + echo "Usage: $0 [opts] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --lmwt # scaling for confidence extraction" + echo " --decode-mbr # use Minimum Bayes Risk decoding" + echo " --grep-filter # remove words from calibration targets" + exit 1; +fi + +set -euo pipefail + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +word_feats=$3 +latdir=$4 +dir=$5 + +model=$latdir/../final.mdl # assume model one level up from decoding dir. + +for f in $data/text $lang/words.txt $word_feats $latdir/lat.1.gz; do + [ ! -f $f ] && echo "$0: Missing file $f" && exit 1 +done +[ -z "$cmd" ] && echo "$0: Missing --cmd '...'" && exit 1 + +[ -d $dir/log ] || mkdir -p $dir/log +nj=$(cat $latdir/num_jobs) + +# Store the setup, +echo $lmwt >$dir/lmwt +echo $decode_mbr >$dir/decode_mbr +cp $word_feats $dir/word_feats + +# Create the ctm with raw confidences, +# - we keep the timing relative to the utterance, +if [ $stage -le 0 ]; then + $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ + lattice-scale --inv-acoustic-scale=$lmwt "ark:gunzip -c $latdir/lat.JOB.gz|" ark:- \| \ + lattice-limit-depth ark:- ark:- \| \ + lattice-push --push-strings=false ark:- ark:- \| \ + lattice-align-words-lexicon --max-expand=10.0 \ + $lang/phones/align_lexicon.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \ + '>' $dir/JOB.ctm + # Merge and clean, + for ((n=1; n<=nj; n++)); do cat $dir/${n}.ctm; done > $dir/ctm + rm $dir/*.ctm +fi + +# Get evaluation of the 'ctm' using the 'text' reference, +if [ $stage -le 1 ]; then + steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + align-text --special-symbol="" ark:$data/text ark:- ark,t:- | \ + utils/scoring/wer_per_utt_details.pl --special-symbol "" \ + >$dir/align_text + # Append alignment to ctm, + steps/conf/append_eval_to_ctm.py $dir/align_text $dir/ctm $dir/ctm_aligned + # Convert words to 'ids', + cat $dir/ctm_aligned | utils/sym2int.pl -f 5 $lang/words.txt >$dir/ctm_aligned_int +fi + +# Prepare word-categories (based on wotd frequencies in 'ctm'), +if [ -z "$category_text" ]; then + steps/conf/convert_ctm_to_tra.py $dir/ctm - | \ + steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt - $dir/word_categories +else + steps/conf/prepare_word_categories.py --min-count $word_min_count $lang/words.txt "$category_text" $dir/word_categories +fi + +# Compute lattice-depth, +latdepth=$dir/lattice_frame_depth.ark +if [ $stage -le 2 ]; then + [ -e $latdepth ] || steps/conf/lattice_depth_per_frame.sh --cmd "$cmd" $latdir $dir +fi + +# Create the training data for logistic regression, +if [ $stage -le 3 ]; then + steps/conf/prepare_calibration_data.py \ + --conf-targets $dir/train_targets.ark --conf-feats $dir/train_feats.ark \ + --lattice-depth $latdepth $dir/ctm_aligned_int $word_feats $dir/word_categories +fi + +# Train the logistic regression, +if [ $stage -le 4 ]; then + logistic-regression-train --binary=false --normalizer=$normalizer ark:$dir/train_feats.ark \ + ark:$dir/train_targets.ark 
$dir/calibration.mdl 2>$dir/log/logistic-regression-train.log +fi + +# Apply calibration model to dev, +if [ $stage -le 5 ]; then + logistic-regression-eval --apply-log=false $dir/calibration.mdl \ + ark:$dir/train_feats.ark ark,t:- | \ + awk '{ key=$1; p_corr=$4; sub(/,.*/,"",key); gsub(/\^/," ",key); print key,p_corr }' | \ + utils/int2sym.pl -f 5 $lang/words.txt \ + >$dir/ctm_calibrated_int +fi + +exit 0 diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index b0e2fed2017..f2bc1d367fd 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -3,8 +3,8 @@ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 -# Begin configuration section. -transform_dir= # this option won't normally be used, but it can be used if you want to +# Begin configuration section. +transform_dir= # this option won't normally be used, but it can be used if you want to # supply existing fMLLR transforms when decoding. iter= model= # You can specify the model to use (e.g. if you want to use the .alimdl) @@ -64,16 +64,16 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs if [ -z "$model" ]; then # if --model was not specified on the command line... - if [ -z $iter ]; then model=$srcdir/final.mdl; + if [ -z $iter ]; then model=$srcdir/final.mdl; else model=$srcdir/$iter.mdl; fi fi if [ $(basename $model) != final.alimdl ] ; then # Do not use the $srcpath -- look at the path where the model is - if [ -f $(dirname $model)/final.alimdl ] ; then - echo -e '\n\n' - echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' - echo $0 'WARNING: This is OK if you know what you are doing...' + if [ -f $(dirname $model)/final.alimdl ] && [ -z "$transform_dir" ]; then + echo -e '\n\n' + echo $0 'WARNING: Running speaker independent system decoding using a SAT model!' + echo $0 'WARNING: This is OK if you know what you are doing...' echo -e '\n\n' fi fi @@ -90,7 +90,7 @@ cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; @@ -129,7 +129,7 @@ fi if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; - local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } fi diff --git a/egs/wsj/s5/steps/decode_basis_fmllr.sh b/egs/wsj/s5/steps/decode_basis_fmllr.sh index d0d37aed016..afb914e7f0d 100755 --- a/egs/wsj/s5/steps/decode_basis_fmllr.sh +++ b/egs/wsj/s5/steps/decode_basis_fmllr.sh @@ -95,6 +95,7 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. 
cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; @@ -144,7 +145,7 @@ done if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type"; case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_biglm.sh b/egs/wsj/s5/steps/decode_biglm.sh index 9146ab8cebf..0663391430d 100755 --- a/egs/wsj/s5/steps/decode_biglm.sh +++ b/egs/wsj/s5/steps/decode_biglm.sh @@ -45,6 +45,7 @@ srcdir=`dirname $dir`; # The model directory is one level up from decoding direc sdata=$data/split$nj; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -60,7 +61,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "decode_si.sh: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_combine.sh b/egs/wsj/s5/steps/decode_combine.sh index ca4f84efdc7..e2926ee0e3a 100755 --- a/egs/wsj/s5/steps/decode_combine.sh +++ b/egs/wsj/s5/steps/decode_combine.sh @@ -47,7 +47,7 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs # The lattice-interp command does the score interpolation (with composition), -# and the lattice-copy-backoff replaces the result with the 1st lattice, in +# and the lattice-copy-backoff replaces the result with the 1st lattice, in # cases where the composed result was empty. $cmd JOB=1:$nj $dir/log/interp.JOB.log \ lattice-interp --alpha=$weight1 "ark:gunzip -c $srcdir1/lat.JOB.gz|" \ @@ -55,6 +55,8 @@ $cmd JOB=1:$nj $dir/log/interp.JOB.log \ lattice-copy-backoff "ark,s,cs:gunzip -c $srcdir1/lat.JOB.gz|" ark,s,cs:- \ "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +cp $srcdir1/final.mdl $dir/final.mdl + if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; diff --git a/egs/wsj/s5/steps/decode_fmmi.sh b/egs/wsj/s5/steps/decode_fmmi.sh index b655d076698..5460d37ff28 100755 --- a/egs/wsj/s5/steps/decode_fmmi.sh +++ b/egs/wsj/s5/steps/decode_fmmi.sh @@ -58,6 +58,7 @@ srcdir=`dirname $dir`; # The model directory is one level up from decoding direc sdata=$data/split$nj; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" @@ -75,7 +76,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "decode_fmmi.sh: feature type is $feat_type"; case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_fwdbwd.sh b/egs/wsj/s5/steps/decode_fwdbwd.sh index 27c2d483301..f0e36227251 100755 --- a/egs/wsj/s5/steps/decode_fwdbwd.sh +++ b/egs/wsj/s5/steps/decode_fwdbwd.sh @@ -75,9 +75,10 @@ echo "decode_fwdbwd.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_nolats.sh b/egs/wsj/s5/steps/decode_nolats.sh index 6f5e780cf30..9c05d3eea30 100755 --- a/egs/wsj/s5/steps/decode_nolats.sh +++ b/egs/wsj/s5/steps/decode_nolats.sh @@ -83,9 +83,10 @@ echo "decode.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/decode_with_map.sh b/egs/wsj/s5/steps/decode_with_map.sh 
index e05e4de4097..ab507debd11 100755 --- a/egs/wsj/s5/steps/decode_with_map.sh +++ b/egs/wsj/s5/steps/decode_with_map.sh @@ -71,9 +71,10 @@ echo "decode.sh: feature type is $feat_type"; splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/get_ctm.sh b/egs/wsj/s5/steps/get_ctm.sh index 3d0ea576a57..2f2f6794e3d 100755 --- a/egs/wsj/s5/steps/get_ctm.sh +++ b/egs/wsj/s5/steps/get_ctm.sh @@ -8,6 +8,7 @@ # begin configuration section. cmd=run.pl stage=0 +frame_shift=0.01 min_lmwt=5 max_lmwt=20 use_segments=true # if we have a segments file, use it to convert @@ -28,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your lattices have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri4a/decode/" echo "See also: steps/get_train_ctm.sh" @@ -55,7 +58,7 @@ if [ $stage -le 0 ]; then [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" else - filter_cmd=cat + filter_cmd=cat fi if [ -f $lang/phones/word_boundary.int ]; then @@ -63,7 +66,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; else @@ -76,7 +79,7 @@ if [ $stage -le 0 ]; then set -o pipefail '&&' mkdir -p $dir/score_LMWT/ '&&' \ lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; fi diff --git a/egs/wsj/s5/steps/get_train_ctm.sh b/egs/wsj/s5/steps/get_train_ctm.sh index a6cbb2ac06a..10b29708d84 100755 --- a/egs/wsj/s5/steps/get_train_ctm.sh +++ b/egs/wsj/s5/steps/get_train_ctm.sh @@ -7,9 +7,12 @@ # begin configuration section. cmd=run.pl +frame_shift=0.01 stage=0 use_segments=true # if we have a segments file, use it to convert # the segments to be relative to the original files. +print_silence=false # if true, will print (optional-silence) arcs. + #end configuration section. 
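# Note (illustrative, not part of this change): the 0.01 default matches the usual 10 ms frame shift;
# if the alignments or lattices come from a model decoded with a frame-subsampling factor of 3
# (e.g. 'chain' models), pass --frame-shift 0.03 to this script and to get_ctm.sh so that the ctm
# start times and durations stay in seconds.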
echo "$0 $@" # Print the command line for logging @@ -26,6 +29,8 @@ if [ $# -ne 3 ]; then echo " # to produce a ctm relative to the original audio" echo " # files, with channel information (typically needed" echo " # for NIST scoring)." + echo " --frame-shift (default=0.01) # specify this if your alignments have a frame-shift" + echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri3a_ali" echo "Produces ctm in: exp/tri3a_ali/ctm" @@ -58,9 +63,9 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 else if [ ! -f $lang/phones/align_lexicon.int ]; then echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." @@ -71,14 +76,14 @@ if [ $stage -le 0 ]; then "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \ '' '' ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ + nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \| \ - gzip -c '>' $dir/ctm.JOB.gz + gzip -c '>' $dir/ctm.JOB.gz || exit 1 fi fi if [ $stage -le 1 ]; then - if [ -f $data/segments ]; then + if [ -f $data/segments ] && $use_segments; then f=$data/reco2file_and_channel [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; for n in `seq $nj`; do gunzip -c $dir/ctm.$n.gz; done | \ diff --git a/egs/wsj/s5/steps/lmrescore.sh b/egs/wsj/s5/steps/lmrescore.sh index 0652c6c13ca..86595e862b9 100755 --- a/egs/wsj/s5/steps/lmrescore.sh +++ b/egs/wsj/s5/steps/lmrescore.sh @@ -4,6 +4,7 @@ mode=4 cmd=run.pl skip_scoring=false +self_loop_scale=0.1 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -32,6 +33,10 @@ newlm=$newlang/G.fst [ ! -f $newlm ] && echo Missing file $newlm && exit 1; ! ls $indir/lat.*.gz >/dev/null && echo "No lattices input directory $indir" && exit 1; +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + oldlmcommand="fstproject --project_output=true $oldlm |" newlmcommand="fstproject --project_output=true $newlm |" @@ -75,7 +80,7 @@ case "$mode" in gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; 3) # 3 is "exact" in that we remove the old LM scores accepting any path - # through G.fst (which is what we want as that happened in lattice + # through G.fst (which is what we want as that happened in lattice # generation), but we add the new one with "phi matcher", only taking # backoff arcs if an explicit arc did not exist. 
$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ @@ -100,7 +105,7 @@ case "$mode" in lattice-compose ark:- $outdir/Ldet.fst ark:- \| \ lattice-determinize ark:- ark:- \| \ lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \ - lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \ + lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=$self_loop_scale \ $mdl ark:- ark:- \| \ gzip -c \>$outdir/lat.JOB.gz || exit 1; ;; diff --git a/egs/wsj/s5/steps/lmrescore_const_arpa.sh b/egs/wsj/s5/steps/lmrescore_const_arpa.sh index 3d70d41e59e..81698f07f0d 100755 --- a/egs/wsj/s5/steps/lmrescore_const_arpa.sh +++ b/egs/wsj/s5/steps/lmrescore_const_arpa.sh @@ -8,6 +8,8 @@ # Begin configuration section. cmd=run.pl skip_scoring=false +stage=1 +scoring_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -39,22 +41,28 @@ newlm=$newlang/G.carpa ! ls $indir/lat.*.gz >/dev/null &&\ echo "$0: No lattices input directory $indir" && exit 1; +if ! cmp -s $oldlang/words.txt $newlang/words.txt; then + echo "$0: $oldlang/words.txt and $newlang/words.txt differ: make sure you know what you are doing."; +fi + oldlmcommand="fstproject --project_output=true $oldlm |" mkdir -p $outdir/log nj=`cat $indir/num_jobs` || exit 1; cp $indir/num_jobs $outdir -$cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ - lattice-lmrescore --lm-scale=-1.0 \ - "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ - lattice-lmrescore-const-arpa --lm-scale=1.0 \ - ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=-1.0 \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlmcommand" ark:- \| \ + lattice-lmrescore-const-arpa --lm-scale=1.0 \ + ark:- "$newlm" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +fi -if ! $skip_scoring ; then +if ! $skip_scoring && [ $stage -le 2 ]; then err_msg="Not scoring because local/score.sh does not exist or not executable." [ ! -x local/score.sh ] && echo $err_msg && exit 1; - local/score.sh --cmd "$cmd" $data $newlang $outdir + local/score.sh --cmd "$cmd" $scoring_opts $data $newlang $outdir else echo "Not scoring because requested so..." fi diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh new file mode 100755 index 00000000000..a669f5bc3d5 --- /dev/null +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2015 Guoguo Chen +# Apache 2.0 + +# This script rescores lattices with RNNLM. See also rnnlmrescore.sh which is +# an older script using n-best lists. + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +max_ngram_order=4 +N=10 +inv_acwt=12 +weight=1.0 # Interpolation weight for RNNLM. +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "with RNNLM." + echo "" + echo "Usage: $0 [options] \\" + echo " " + echo " e.g.: $0 ./rnnlm data/lang_tg data/test \\" + echo " exp/tri3/test_tg exp/tri3/test_rnnlm" + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +rnnlm_dir=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +if [ -f $oldlang/G.carpa ]; then + oldlm=$oldlang/G.carpa +elif [ ! -f $oldlm ]; then + echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ + exit 1; +fi + +[ ! 
-f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; +[ ! -f $rnnlm_dir/unk.probs ] &&\ + echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; +[ ! -f $oldlang/words.txt ] &&\ + echo "$0: Missing file $oldlang/words.txt" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; +awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { + print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ + || exit 1; + +oldlm_command="fstproject --project_output=true $oldlm |" + +acwt=`perl -e "print (1.0/$inv_acwt);"` + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +oldlm_weight=`perl -e "print -1.0 * $weight;"` +if [ "$oldlm" == "$oldlang/G.fst" ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + lattice-lmrescore-rnnlm --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order ark:$rnnlm_dir/unk.probs \ + $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ + "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +else + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + lattice-lmrescore-rnnlm --lm-scale=$weight \ + --max-ngram-order=$max_ngram_order ark:$rnnlm_dir/unk.probs \ + $oldlang/words.txt ark:- "$rnnlm_dir/rnnlm" \ + "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +fi + +if ! $skip_scoring ; then + err_msg="Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + local/score.sh --cmd "$cmd" $data $oldlang $outdir +else + echo "Not scoring because requested so..." 
+fi + +exit 0; diff --git a/egs/wsj/s5/steps/make_denlats.sh b/egs/wsj/s5/steps/make_denlats.sh index 65b4bb8d320..6afecfe5246 100755 --- a/egs/wsj/s5/steps/make_denlats.sh +++ b/egs/wsj/s5/steps/make_denlats.sh @@ -51,6 +51,7 @@ dir=$4 sdata=$data/split$nj splice_opts=`cat $srcdir/splice_opts 2>/dev/null` cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" @@ -87,7 +88,7 @@ if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" cp $srcdir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/make_mfcc.sh b/egs/wsj/s5/steps/make_mfcc.sh index 1d152f6cf8d..09c34d40b24 100755 --- a/egs/wsj/s5/steps/make_mfcc.sh +++ b/egs/wsj/s5/steps/make_mfcc.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 @@ -81,7 +81,7 @@ if [ -f $data/segments ]; then for n in $(seq $nj); do split_segments="$split_segments $logdir/segments.$n" done - + utils/split_scp.pl $data/segments $split_segments || exit 1; rm $logdir/.error 2>/dev/null @@ -127,8 +127,8 @@ done > $data/feats.scp rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 4dbb5a8a206..247e5a35d5d 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -4,7 +4,7 @@ # Copyright 2013 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. -# This script makes a phone-based LM, without smoothing to unigram, that +# This script makes a phone-based LM, without smoothing to unigram, that # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. @@ -46,7 +46,7 @@ done loc=`which ngram-count`; if [ -z $loc ]; then if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... - sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 else sdir=`pwd`/../../../tools/srilm/bin/i686 fi @@ -92,17 +92,14 @@ fi if [ $stage -le 3 ]; then echo "$0: creating G_phones.fst from ARPA" - gunzip -c $dir/phone_graph/arpa_noug.gz | arpa2fst - - | fstprint | \ - utils/eps2disambig.pl | utils/s2eps.pl | \ - awk '{if (NF < 5 || $5 < 100.0) { print; }}' | \ - fstcompile --isymbols=$lang/phones.txt --osymbols=$lang/phones.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstconnect | \ - fstrmepsilon > $dir/phone_graph/G_phones.fst - fstisstochastic $dir/phone_graph/G_phones.fst || echo "[info]: G_phones not stochastic." 
+ gunzip -c $dir/phone_graph/arpa_noug.gz | \ + arpa2fst --disambig-symbol=#0 --read-symbol-table=$lang/phones.txt - - | \ + fstprint | awk '{if (NF < 5 || $5 < 100.0) { print; }}' | fstcompile | \ + fstconnect > $dir/phone_graph/G_phones.fst + fstisstochastic $dir/phone_graph/G_phones.fst || echo "[info]: G_phones not stochastic." fi - + if [ $stage -le 4 ]; then echo "$0: creating CLG." @@ -118,7 +115,7 @@ if [ $stage -le 5 ]; then echo "$0: creating Ha.fst" make-h-transducer --disambig-syms-out=$dir/phone_graph/disambig_tid.int \ --transition-scale=$tscale $dir/phone_graph/ilabels_${N}_${P} $dir/tree $dir/final.mdl \ - > $dir/phone_graph/Ha.fst + > $dir/phone_graph/Ha.fst fi if [ $stage -le 6 ]; then @@ -135,7 +132,7 @@ if [ $stage -le 7 ]; then $dir/final.mdl < $dir/phone_graph/HCLGa.fst > $dir/phone_graph/HCLG.fst || exit 1; if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. + # No point doing this test if transition-scale not 1, as it is bound to fail. fstisstochastic $dir/phone_graph/HCLG.fst || echo "[info]: final HCLG is not stochastic." fi diff --git a/egs/wsj/s5/steps/nnet/align.sh b/egs/wsj/s5/steps/nnet/align.sh index eae3f552658..7ba12cdf114 100755 --- a/egs/wsj/s5/steps/nnet/align.sh +++ b/egs/wsj/s5/steps/nnet/align.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # Aligns 'data' to sequences of transition-ids using Neural Network based acoustic model. @@ -14,6 +14,7 @@ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 retry_beam=40 nnet_forward_opts="--no-softmax=true --prior-scale=1.0" +ivector= # rx-specifier with i-vectors (ark-with-vectors), align_to_lats=false # optionally produce alignment in lattice format lats_decode_opts="--acoustic-scale=0.1 --beam=20 --lattice_beam=10" @@ -27,6 +28,8 @@ use_gpu="no" # yes|no|optionaly [ -f path.sh ] && . ./path.sh # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 4 ]; then echo "usage: $0 " echo "e.g.: $0 data/train data/lang exp/tri1 exp/tri1_ali" @@ -78,6 +81,27 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. 
mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + # nnet-forward, feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |" # diff --git a/egs/wsj/s5/steps/nnet/decode.sh b/egs/wsj/s5/steps/nnet/decode.sh index 35065db20e7..49ba466fc26 100755 --- a/egs/wsj/s5/steps/nnet/decode.sh +++ b/egs/wsj/s5/steps/nnet/decode.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), Daniel Povey # Apache 2.0 # Begin configuration section. @@ -9,6 +9,10 @@ feature_transform= # non-default location of feature_transform (optional) model= # non-default location of transition model (optional) class_frame_counts= # non-default location of PDF counts (optional) srcdir= # non-default location of DNN-dir (decouples model dir from decode dir) +ivector= # rx-specifier with i-vectors (ark-with-vectors), + +blocksoftmax_dims= # 'csl' with block-softmax dimensions: dim1,dim2,dim3,... +blocksoftmax_active= # '1' for the 1st block, stage=0 # stage=1 skips lattice generation nj=4 @@ -35,6 +39,8 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "... where is assumed to be a sub-directory of the directory" @@ -109,12 +115,44 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" # add-pytel transform (optional), [ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" -# + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +# select a block from blocksoftmax, +if [ ! -z "$blocksoftmax_dims" ]; then + # blocksoftmax_active is a csl! dim1,dim2,dim3,... 
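# [Editor's note, not part of the patch] Sketch of the block selection done below,
# assuming for illustration --blocksoftmax-dims 3000,1000:
#   --blocksoftmax-active 1  ->  offset=0,    the copy selects rows 1:3000
#   --blocksoftmax-active 2  ->  offset=3000, the copy selects rows 3001:4000
# The trained multi-task output layer is stripped (--remove-last-components=1) and
# replaced on the fly by a copy-of-one-block plus softmax, so only the outputs of
# the selected block are used for decoding.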
+ [ -z "$blocksoftmax_active" ] && echo "$0 Missing option --blocksoftmax-active N" && exit 1 + # getting dims, + dim_total=$(awk -F'[:,]' '{ for(i=1;i<=NF;i++) { sum += $i }; print sum; }' <(echo $blocksoftmax_dims)) + dim_block=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ print $active; }' <(echo $blocksoftmax_dims)) + offset=$(awk -F'[:,]' -v active=$blocksoftmax_active '{ sum=0; for(i=1;i $dim_total $dim_block $((1+offset)):$((offset+dim_block)) "; + echo " $dim_block $dim_block") $dir/copy_and_softmax.nnet + # nnet is assembled on-the fly, is removed, while + is added, + nnet="nnet-concat 'nnet-copy --remove-last-components=1 $nnet - |' $dir/copy_and_softmax.nnet - |" +fi # Run the decoding in the queue, if [ $stage -le 0 ]; then $cmd --num-threads $((num_threads+1)) JOB=1:$nj $dir/log/decode.JOB.log \ - nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ + nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu "$nnet" "$feats" ark:- \| \ latgen-faster-mapped$thread_string --min-active=$min_active --max-active=$max_active --max-mem=$max_mem --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \ $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; diff --git a/egs/wsj/s5/steps/nnet/make_bn_feats.sh b/egs/wsj/s5/steps/nnet/make_bn_feats.sh index 1c7b66b02f5..83a2a5fc159 100755 --- a/egs/wsj/s5/steps/nnet/make_bn_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_bn_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # To be run from .. (one directory up from here) # see ../run.sh for example @@ -9,8 +9,10 @@ nj=4 cmd=run.pl remove_last_components=4 # remove N last components from the nnet +nnet_forward_opts= use_gpu=no htk_save=false +ivector= # rx-specifier with i-vectors (ark-with-vectors), # End configuration section. echo "$0 $@" # Print the command line for logging @@ -18,6 +20,8 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "usage: $0 [options] "; echo "options: " @@ -78,12 +82,31 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$srcdata/utt2spk scp:$srcdata/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. 
mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi if [ $htk_save == false ]; then # Run the forward pass, $cmd JOB=1:$nj $logdir/make_bnfeats.JOB.log \ - nnet-forward --use-gpu=$use_gpu $nnet "$feats" \ + nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" \ ark,scp:$bnfeadir/raw_bnfea_$name.JOB.ark,$bnfeadir/raw_bnfea_$name.JOB.scp \ || exit 1; # concatenate the .scp files @@ -101,7 +124,7 @@ else # htk_save == true # Run the forward pass saving HTK features, $cmd JOB=1:$nj $logdir/make_bnfeats_htk.JOB.log \ mkdir -p $data/htkfeats/JOB \; \ - nnet-forward --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ + nnet-forward $nnet_forward_opts --use-gpu=$use_gpu $nnet "$feats" ark:- \| \ copy-feats-to-htk --output-dir=$data/htkfeats/JOB ark:- || exit 1 # Make list of htk features, find $data/htkfeats -name *.fea >$data/htkfeats.scp diff --git a/egs/wsj/s5/steps/nnet/make_denlats.sh b/egs/wsj/s5/steps/nnet/make_denlats.sh index 02d25c744d7..3ad1d248df3 100755 --- a/egs/wsj/s5/steps/nnet/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet/make_denlats.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2012-2013 Karel Vesely, Daniel Povey +# Copyright 2012-2013 Brno University of Technology (author: Karel Vesely), Daniel Povey # Apache 2.0. # Create denominator lattices for MMI/MPE/sMBR training. @@ -22,12 +22,15 @@ max_mem=20000000 # This will stop the processes getting too large. # End configuration section. use_gpu=no # yes|no|optional parallel_opts="--num-threads 2" +ivector= # rx-specifier with i-vectors (ark-with-vectors), echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 4 ]; then echo "Usage: steps/$0 [options] " echo " e.g.: steps/$0 data/train data/lang exp/tri1 exp/tri1_denlats" @@ -110,15 +113,35 @@ feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + # nnet-forward, feats="$feats nnet-forward $nnet_forward_opts --feature-transform=$feature_transform --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $nnet ark:- ark:- |" -# # if this job is interrupted by the user, we want any background jobs to be # killed too. 
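# [Editor's note, not part of the patch] The '|| true' added to cleanup() below is
# presumably needed because this script now runs under 'set -euo pipefail' (added
# above): when there are no background jobs, $pids is empty, the
# '[ -n "$pids" ] && kill $pids' compound returns non-zero, and without '|| true'
# the trap handler itself would trip the errexit handling.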
cleanup() { local pids=$(jobs -pr) - [ -n "$pids" ] && kill $pids + [ -n "$pids" ] && kill $pids || true } trap "cleanup" INT QUIT TERM EXIT @@ -140,7 +163,7 @@ else # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim # to have at most two jobs running at each time. The idea is that if we have stragglers # from one job, we can be processing another one at the same time. - rm $dir/.error 2>/dev/null + rm -f $dir/.error prev_pid= for n in `seq $[nj+1]`; do diff --git a/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh b/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh index fd2ab230f47..c9d679004f1 100755 --- a/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_fmllr_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely), +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), # # Apache 2.0. # @@ -19,19 +19,21 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "Usage: $0 [options] " echo "e.g.: $0 data-fmllr/train data/train exp/tri5a exp/make_fmllr_feats/log plp/processed/" echo "" - echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out" - echo "what type of features you used (assuming it's one of these two)" - echo "You can also use fMLLR features-- you have to supply --transform-dir option." + echo "This script dumps fMLLR features to disk, so it can be used for NN training." + echo "It automoatically figures out the 'feature-type' of the source GMM systems." echo "" echo "main options (for others, see top of script file)" echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs" echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --transform-dir # where to find fMLLR transforms." + echo " --transform-dir # dir with fMLLR transforms" + echo " --raw-transform-dir # dir with raw-fMLLR transforms" exit 1; fi @@ -42,9 +44,12 @@ logdir=$4 feadir=$5 sdata=$srcdata/split$nj; -splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` # frame-splicing options. -cmvn_opts=`cat $gmmdir/cmvn_opts 2>/dev/null` -delta_opts=`cat $gmmdir/delta_opts 2>/dev/null` + +# Get the config, +D=$gmmdir +[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts= +[ -f $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) || delta_opts= +[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts= mkdir -p $data $logdir $feadir [[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; @@ -59,22 +64,17 @@ done echo "$0: Missing $raw_transform_dir/raw_trans.1" && exit 1; # Figure-out the feature-type, -feat_type=delta # Default -[ ! -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=delta_fmllr -[ -f $gmmdir/final.mat ] && feat_type=lda -[ -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=lda_fmllr +feat_type="[UNKNOWN]" +[ -z "$raw_transform_dir" -a ! -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=delta_fmllr +[ -z "$raw_transform_dir" -a -f $gmmdir/final.mat -a ! -z "$transform_dir" ] && feat_type=lda_fmllr [ ! -z "$raw_transform_dir" ] && feat_type=raw_fmllr -[ ! -z "$raw_transform_dir" -a -f $gmmdir/final.mat -a ! 
-z "$transform_dir" ] && feat_type=raw_fmllr_lda_fmllr echo "$0: feature type is $feat_type"; # Hand-code the feature pipeline, case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; delta_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; - lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- |";; lda_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; raw_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$raw_transform_dir/raw_trans.JOB ark:- ark:- |";; - raw_fmllr_lda_fmllr) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$cur_trans_dir/raw_trans.JOB ark:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $gmmdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk \"ark:cat $transform_dir/trans.* |\" ark:- ark:- |";; *) echo "Invalid feature type $feat_type" && exit 1; esac diff --git a/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh b/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh index a0b28250aa6..2874f00067b 100755 --- a/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh +++ b/egs/wsj/s5/steps/nnet/make_fmmi_feats.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2013 Brno University of Technology (Author: Karel Vesely), +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely), # # Apache 2.0 # @@ -20,6 +20,8 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 5 ]; then echo "Usage: $0 [options] " echo "e.g.: $0 data-fmmi/train data/train exp/tri5a_fmmi_b0.1 data-fmmi/train/_log data-fmmi/train/_data " @@ -44,8 +46,11 @@ logdir=$4 feadir=$5 sdata=$srcdata/split$nj; -splice_opts=`cat $gmmdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $gmmdir/cmvn_opts 2>/dev/null` + +# Get the config, +D=$gmmdir +[ -f $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) || cmvn_opts= +[ -f $D/splice_opts ] && splice_opts=$(cat $D/splice_opts) || splice_opts= mkdir -p $data $logdir $feadir [[ -d $sdata && $srcdata/feats.scp -ot $sdata ]] || split_data.sh $srcdata $nj || exit 1; diff --git a/egs/wsj/s5/steps/nnet/make_priors.sh b/egs/wsj/s5/steps/nnet/make_priors.sh index f3e9c1edbee..3e7967a1b58 100755 --- a/egs/wsj/s5/steps/nnet/make_priors.sh +++ b/egs/wsj/s5/steps/nnet/make_priors.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Brno University of Technology (author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # To be run from .. 
(one directory up from here) # see ../run.sh for example @@ -9,6 +9,7 @@ nj=4 cmd=run.pl use_gpu=no +ivector= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -16,6 +17,8 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 2 ]; then echo "usage: $0 [options] "; echo "options: " @@ -43,6 +46,9 @@ sdata=$data/split$nj echo "Accumulating prior stats by forwarding '$data' with '$nndir'" +# We estimate priors on 10k utterances, selected randomly from the splitted data, +N=$((10000/nj)) + # PREPARE FEATURE EXTRACTION PIPELINE # import config, cmvn_opts= @@ -54,13 +60,32 @@ D=$nndir [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) # # Create the feature stream, -feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |" +feats="ark:cat $sdata/JOB/feats.scp | utils/shuffle_list.pl --srand 777 | head -n$N | copy-feats scp:- ark:- |" # apply-cmvn (optional), [ ! -z "$cmvn_opts" -a ! -f $sdata/1/cmvn.scp ] && echo "$0: Missing $sdata/1/cmvn.scp" && exit 1 [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + feats_job_1=$(sed 's:JOB:1:g' <(echo $feats)) + dim_raw=$(feat-to-dim "$feats_job_1" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi # Run the forward pass, $cmd JOB=1:$nj $nndir/log/prior_stats.JOB.log \ diff --git a/egs/wsj/s5/steps/nnet/pretrain_dbn.sh b/egs/wsj/s5/steps/nnet/pretrain_dbn.sh index c8d9250f420..0895ddf1500 100755 --- a/egs/wsj/s5/steps/nnet/pretrain_dbn.sh +++ b/egs/wsj/s5/steps/nnet/pretrain_dbn.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,46 +14,54 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. -# To be run from .. +# To be run from ../../ # -# Deep Belief Network pre-training by Contrastive Divergence (CD-1) algorithm. -# The script can pre-train on plain features (ie. saved fMLLR features), -# or modified features (CMN, delta). -# The script creates feature-transform in nnet format, which contains splice -# and shift+scale (zero mean and unit variance on DBN input). +# Restricted Boltzman Machine (RBM) pre-training by Contrastive Divergence +# algorithm (CD-1). A stack of RBMs forms a Deep Belief Neetwork (DBN). 
+# +# This script by default pre-trains on plain features (ie. saved fMLLR features), +# building a 'feature_transform' containing +/-5 frame splice and global CMVN. +# +# There is also a support for adding speaker-based CMVN, deltas, i-vectors, +# or passing custom 'feature_transform' or its prototype. # -# For special cases it is possible to use external feature-transform. -# # Begin configuration. -# -# nnet config -nn_depth=6 #number of hidden layers -hid_dim=2048 #number of units per layer -param_stddev_first=0.1 #init parameters in 1st RBM -param_stddev=0.1 #init parameters in other RBMs + +# topology, initialization, +nn_depth=6 # number of hidden layers, +hid_dim=2048 # number of neurons per layer, +param_stddev_first=0.1 # init parameters in 1st RBM +param_stddev=0.1 # init parameters in other RBMs input_vis_type=gauss # type of visible nodes on DBN input -# number of iterations -rbm_iter=1 #number of pre-training epochs (Gaussian-Bernoulli RBM has 2x more) -# pre-training opts -rbm_lrate=0.4 #RBM learning rate -rbm_lrate_low=0.01 #lower RBM learning rate (for Gaussian units) -rbm_l2penalty=0.0002 #L2 penalty (increases RBM-mixing rate) + +# number of iterations, +rbm_iter=1 # number of pre-training epochs (Gaussian-Bernoulli RBM has 2x more) + +# pre-training opts, +rbm_lrate=0.4 # RBM learning rate +rbm_lrate_low=0.01 # lower RBM learning rate (for Gaussian units) +rbm_l2penalty=0.0002 # L2 penalty (increases RBM-mixing rate) rbm_extra_opts= -# data processing config -copy_feats=true # resave the features randomized consecutively to tmpdir - copy_feats_tmproot= # tmproot for copy-feats (optional) -# feature config -feature_transform= # Optionally reuse feature processing front-end (override splice,etc.) -feature_transform_proto= # Optionally pass prototype of feature transform -cmvn_opts= # Optionally do CMVN of the input features with options -delta_opts= # Optionally use deltas on the input features -splice=5 # Temporal splicing -splice_step=1 # Stepsize of the splicing (1 is consecutive splice, - # value 2 would do [ -10 -8 -6 -4 -2 0 2 4 6 8 10 ] splicing) + +# data processing, +copy_feats=true # resave the features to tmpdir, +copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats', + +# feature processing, +splice=5 # (default) splice features both-ways along time axis, +cmvn_opts= # (optional) adds 'apply-cmvn' to input feature pipeline, see opts, +delta_opts= # (optional) adds 'add-deltas' to input feature pipeline, see opts, +ivector= # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream, +ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors, + +feature_transform_proto= # (optional) use this prototype for 'feature_transform', +feature_transform= # (optional) directly use this 'feature_transform', + # misc. verbose=1 # enable per-cache reports skip_cuda_check=false + # End configuration. echo "$0 $@" # Print the command line for logging @@ -61,6 +69,7 @@ echo "$0 $@" # Print the command line for logging [ -f path.sh ] && . ./path.sh; . 
parse_options.sh || exit 1; +set -euo pipefail if [ $# != 2 ]; then echo "Usage: $0 " @@ -71,22 +80,23 @@ if [ $# != 2 ]; then echo " --nn-depth # number of RBM layers" echo " --hid-dim # number of hidden units per layer" echo " --rbm-iter # number of CD-1 iterations per layer" - echo " --dbm-drop-data # probability of frame-dropping," echo " # can be used to subsample large datasets" echo " --rbm-lrate # learning-rate for Bernoulli-Bernoulli RBMs" echo " --rbm-lrate-low # learning-rate for Gaussian-Bernoulli RBM" echo "" - echo " --copy-feats # copy features to /tmp, to accelerate training" - echo " --apply-cmvn # normalize input features (opt.)" - echo " --norm-vars # use variance normalization (opt.)" + echo " --cmvn-opts # add 'apply-cmvn' to input feature pipeline" + echo " --delta-opts # add 'add-deltas' to input feature pipeline" echo " --splice # splice +/-N frames of input features" + echo " --copy-feats # copy features to /tmp, lowers storage stress" + echo "" + echo " --feature_transform_proto # use this prototype for 'feature_transform'" + echo " --feature-transform # directly use this 'feature_transform'" exit 1; fi data=$1 dir=$2 - for f in $data/feats.scp; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -94,7 +104,7 @@ done echo "# INFO" echo "$0 : Pre-training Deep Belief Network as a stack of RBMs" printf "\t dir : $dir \n" -printf "\t Train-set : $data \n" +printf "\t Train-set : $data '$(cat $data/feats.scp | wc -l)'\n" echo [ -e $dir/${nn_depth}.dbn ] && echo "$0 Skipping, already have $dir/${nn_depth}.dbn" && exit 0 @@ -107,104 +117,153 @@ mkdir -p $dir/log ###### PREPARE FEATURES ###### echo echo "# PREPARING FEATURES" -# shuffle the list -echo "Preparing train/cv lists" -cat $data/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp -# print the list size -wc -l $dir/train.scp - -# re-save the shuffled features, so they are stored sequentially on the disk in /tmp/ if [ "$copy_feats" == "true" ]; then - tmpdir=$(mktemp -d $copy_feats_tmproot); mv $dir/train.scp{,_non_local} - copy-feats scp:$dir/train.scp_non_local ark,scp:$tmpdir/train.ark,$dir/train.scp || exit 1 - trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT + # re-save the features to local disk into /tmp/, + tmpdir=$(mktemp -d $copy_feats_tmproot) + trap "echo \"# Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" INT QUIT TERM EXIT + copy-feats scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp || exit 1 +else + # or copy the list, + cp $data/feats.scp $dir/train_sorted.scp fi +# shuffle the list, +utils/shuffle_list.pl --srand 777 <$dir/train_sorted.scp >$dir/train.scp -# create a 10k utt subset for global cmvn estimates +# create a 10k utt subset for global cmvn estimates, head -n 10000 $dir/train.scp > $dir/train.scp.10k +# for debugging, add list with non-local features, +utils/shuffle_list.pl --srand 777 <$data/feats.scp >$dir/train.scp_non_local + ###### OPTIONALLY IMPORT FEATURE SETTINGS ###### +ivector_dim= # no ivectors, if [ ! 
-z $feature_transform ]; then D=$(dirname $feature_transform) - echo "Importing feature settings from: $transf_dir" + echo "# importing feature settings from dir '$D'" [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) - echo "Imported config : cmvn_opts='$cmvn_opts' delta_opts='$delta_opts'" + [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim) + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'" fi ###### PREPARE FEATURE PIPELINE ###### - # read the features -feats="ark:copy-feats scp:$dir/train.scp ark:- |" +feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |" # optionally add per-speaker CMVN if [ ! -z "$cmvn_opts" ]; then - echo "Will use CMVN statistics : $data/cmvn.scp" + echo "+ 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp" [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1; - cmvn="scp:$data/cmvn.scp" - feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk $cmvn ark:- ark:- |" + [ ! -r $data/utt2spk ] && echo "Missing $data/utt2spk" && exit 1; + feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" else - echo "apply-cmvn not used" + echo "# 'apply-cmvn' not used," fi # optionally add deltas if [ ! -z "$delta_opts" ]; then - feats="$feats add-deltas $delta_opts ark:- ark:- |" + feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |" + echo "# + 'add-deltas' with '$delta_opts'" fi # keep track of the config, -[ ! -z "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts +[ ! -z "$cmvn_opts" ] && echo "$cmvn_opts" >$dir/cmvn_opts [ ! -z "$delta_opts" ] && echo "$delta_opts" >$dir/delta_opts # +# get feature dim, +feat_dim=$(feat-to-dim "$feats_tr" -) +echo "# feature dim : $feat_dim (input of 'feature_transform')" -# get feature dim -echo -n "Getting feature dim : " -feat_dim=$(feat-to-dim --print-args=false "$feats" -) -echo $feat_dim - - -# Now we will start building feature_transform which will -# be applied in CUDA to gain more speed. +# Now we start building 'feature_transform' which goes right in front of a NN. +# The forwarding is computed on a GPU before the frame shuffling is applied. # -# We will use 1GPU for both feature_transform and MLP training in one binary tool. -# It is necessary, because we need to run it as a single process, using single GPU -# and avoiding I/O overheads. +# Same GPU is used both for 'feature_transform' and the NN training. +# So it has to be done by a single process (we are using exclusive mode). +# This also reduces the CPU-GPU uploads/downloads to minimum. if [ ! -z "$feature_transform" ]; then - echo Using already prepared feature_transform: $feature_transform - cp $feature_transform $dir/final.feature_transform + echo "# importing 'feature_transform' from '$feature_transform'" + tmp=$dir/imported_$(basename $feature_transform) + cp $feature_transform $tmp; feature_transform=$tmp else - if [ ! -z "$feature_transform_proto" ]; then - feature_transform=$dir/tr_$(basename $feature_transform_proto) - log=$dir/log/feature-transform-initialize.log - nnet-initialize --binary=false $feature_transform_proto $feature_transform 2>$log || { cat $log; exit 1; } + # Make default proto with splice, + if [ ! 
-z $feature_transform_proto ]; then + echo "# importing custom 'feature_transform_proto' from : $feature_transform_proto" else - # Generate the splice transform - echo "Using splice +/- $splice , step $splice_step" - feature_transform=$dir/tr_splice$splice-$splice_step.nnet - utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice --splice-step=$splice_step > $feature_transform + echo "+ default 'feature_transform_proto' with splice +/-$splice frames" + feature_transform_proto=$dir/splice${splice}.proto + echo " $feat_dim $(((2*splice+1)*feat_dim)) -$splice:$splice " >$feature_transform_proto fi - # Renormalize the MLP input to zero mean and unit variance + # Initialize 'feature-transform' from a prototype, + feature_transform=$dir/tr_$(basename $feature_transform_proto .proto).nnet + nnet-initialize --binary=false $feature_transform_proto $feature_transform + + # Renormalize the MLP input to zero mean and unit variance, feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_cmvn-g.nnet - echo "Renormalizing MLP input features into $feature_transform" - nnet-forward --use-gpu=yes \ - $feature_transform_old "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/cmvn_glob_fwd.log |\ - compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\ - nnet-concat --binary=false $feature_transform_old - $feature_transform - - # MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### - [ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform - (cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) + echo "# compute normalization stats from 10k sentences" + nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- |\ + compute-cmvn-stats ark:- $dir/cmvn-g.stats + echo "# + normalization of NN-input at '$feature_transform'" + nnet-concat --print-args=false --binary=false $feature_transform_old \ + "cmvn-to-nnet $dir/cmvn-g.stats -|" $feature_transform fi +if [ ! -z $ivector ]; then + echo + echo "# ADDING IVECTOR FEATURES" + # The iVectors are concatenated 'as they are' directly to the input of the neural network, + # To do this, we paste the features, and use where the 1st component + # contains the transform and 2nd network contains component. + + echo "# getting dims," + dim_raw=$(feat-to-dim "$feats_tr" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec," + + # Should we do something with 'feature_transform'? + if [ ! -z $ivector_dim ]; then + # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding, + echo "# assuming we got '$feature_transform' with ivector forwarding," + [ $ivector_dim != $dim_ivec ] && \ + echo -n "Error, i-vector dimensionality mismatch!" 
&& \ + echo " (expected $ivector_dim, got $dim_ivec in $ivector)" && exit 1 + else + # Yes, adjust the transform to do ``iVec forwarding'', + feature_transform_old=$feature_transform + feature_transform=${feature_transform%.nnet}_ivec_copy.nnet + echo "# setting up ivector forwarding into '$feature_transform'," + dim_transformed=$(feat-to-dim "$feats_tr nnet-forward $feature_transform_old ark:- ark:- |" -) + nnet-initialize --print-args=false <(echo " $dim_ivec $dim_ivec 1:$dim_ivec ") $dir/tr_ivec_copy.nnet + nnet-initialize --print-args=false <(echo " $((dim_raw+dim_ivec)) $((dim_transformed+dim_ivec)) $feature_transform_old $dir/tr_ivec_copy.nnet ") $feature_transform + fi + echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim! + echo $ivector_append_tool >$dir/ivector_append_tool + + # pasting the iVecs to the feaures, + echo "# + ivector input '$ivector'" + feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +###### Show the final 'feature_transform' in the log, +echo +echo "### Showing the final 'feature_transform':" +nnet-info $feature_transform +echo "###" + +###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### +[ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform +(cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) +feature_transform=$dir/final.feature_transform ###### GET THE DIMENSIONS ###### -num_fea=$(feat-to-dim --print-args=false "$feats nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null) +num_fea=$(feat-to-dim --print-args=false "$feats_tr nnet-forward --use-gpu=no $feature_transform ark:- ark:- |" - 2>/dev/null) num_hid=$hid_dim @@ -215,61 +274,55 @@ for depth in $(seq 1 $nn_depth); do RBM=$dir/$depth.rbm [ -f $RBM ] && echo "RBM '$RBM' already trained, skipping." && continue - # The first RBM needs special treatment, because of Gussian input nodes + # The first RBM needs special treatment, because of Gussian input nodes, if [ "$depth" == "1" ]; then # This is usually Gaussian-Bernoulli RBM (not if CNN layers are part of input transform) - # initialize - echo "Initializing '$RBM.init'" - echo " - $num_fea $num_hid $input_vis_type bern $param_stddev_first - - " > $RBM.proto + # initialize, + echo "# initializing '$RBM.init'" + echo " $num_fea $num_hid $input_vis_type bern $param_stddev_first" > $RBM.proto nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1 - # pre-train - num_iter=$rbm_iter; [ $input_vis_type == "gauss" ] && num_iter=$((2*rbm_iter)) #2x more epochs for Gaussian input + # pre-train, + num_iter=$rbm_iter; [ $input_vis_type == "gauss" ] && num_iter=$((2*rbm_iter)) # 2x more epochs for Gaussian input [ $input_vis_type == "bern" ] && rbm_lrate_low=$rbm_lrate # original lrate for Bernoulli input - echo "Pretraining '$RBM' (input $input_vis_type, lrate $rbm_lrate_low, iters $num_iter)" + echo "# pretraining '$RBM' (input $input_vis_type, lrate $rbm_lrate_low, iters $num_iter)" rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate_low --l2-penalty=$rbm_l2penalty \ --num-iters=$num_iter --verbose=$verbose \ --feature-transform=$feature_transform \ $rbm_extra_opts \ - $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1 + $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1 else - #This is Bernoulli-Bernoulli RBM - #cmvn stats for init - echo "Computing cmvn stats '$dir/$depth.cmvn' for RBM initialization" - if [ ! 
-f $dir/$depth.cmvn ]; then - nnet-forward --use-gpu=yes \ - "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ - "$(echo $feats | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/cmvn_fwd.$depth.log | \ - compute-cmvn-stats ark:- - 2>$dir/log/cmvn.$depth.log | \ - cmvn-to-nnet - $dir/$depth.cmvn || exit 1 + # This is Bernoulli-Bernoulli RBM, + # cmvn stats for init, + echo "# computing cmvn stats '$dir/$depth.cmvn' for RBM initialization" + if [ ! -f $dir/$depth.cmvn ]; then + nnet-forward --print-args=false --use-gpu=yes \ + "nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- | \ + compute-cmvn-stats --print-args=false ark:- - | \ + cmvn-to-nnet --print-args=false - $dir/$depth.cmvn || exit 1 else - echo compute-cmvn-stats already done, skipping. + echo "# compute-cmvn-stats already done, skipping." fi - #initialize - echo "Initializing '$RBM.init'" - echo " - $num_hid $num_hid bern bern $param_stddev $dir/$depth.cmvn - - " > $RBM.proto + # initialize, + echo "initializing '$RBM.init'" + echo " $num_hid $num_hid bern bern $param_stddev $dir/$depth.cmvn" > $RBM.proto nnet-initialize $RBM.proto $RBM.init 2>$dir/log/nnet-initialize.$depth.log || exit 1 - #pre-train - echo "Pretraining '$RBM' (lrate $rbm_lrate, iters $rbm_iter)" + # pre-train, + echo "pretraining '$RBM' (lrate $rbm_lrate, iters $rbm_iter)" rbm-train-cd1-frmshuff --learn-rate=$rbm_lrate --l2-penalty=$rbm_l2penalty \ --num-iters=$rbm_iter --verbose=$verbose \ --feature-transform="nnet-concat $feature_transform $dir/$((depth-1)).dbn - |" \ $rbm_extra_opts \ - $RBM.init "$feats" $RBM 2>$dir/log/rbm.$depth.log || exit 1 + $RBM.init "$feats_tr" $RBM 2>$dir/log/rbm.$depth.log || exit 1 fi - #Create DBN stack + # Create DBN stack, if [ "$depth" == "1" ]; then - rbm-convert-to-nnet --binary=true $RBM $dir/$depth.dbn - else - rbm-convert-to-nnet --binary=true $RBM - | \ - nnet-concat $dir/$((depth-1)).dbn - $dir/$depth.dbn + echo "# converting RBM to $dir/$depth.dbn" + rbm-convert-to-nnet $RBM $dir/$depth.dbn + else + echo "# appending RBM to $dir/$depth.dbn" + nnet-concat $dir/$((depth-1)).dbn "rbm-convert-to-nnet $RBM - |" $dir/$depth.dbn fi done @@ -278,7 +331,7 @@ echo echo "# REPORT" echo "# RBM pre-training progress (line per-layer)" grep progress $dir/log/rbm.*.log -echo +echo echo "Pre-training finished." diff --git a/egs/wsj/s5/steps/nnet/train.sh b/egs/wsj/s5/steps/nnet/train.sh index 1f53c3eb1b7..9f05b34f4d3 100755 --- a/egs/wsj/s5/steps/nnet/train.sh +++ b/egs/wsj/s5/steps/nnet/train.sh @@ -1,92 +1,91 @@ #!/bin/bash -# Copyright 2012/2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 # Begin configuration. -config= # config, which is also sent to all other scripts - -# NETWORK INITIALIZATION -nnet_init= # select initialized MLP (override initialization) -nnet_proto= # select network prototype (initialize it) -proto_opts= # non-default options for 'make_nnet_proto.py' -feature_transform= # provide feature transform (=splice,rescaling,...) (don't build new one) -pytel_transform= # use external transform defined in python (BUT specific) -network_type=dnn # (dnn,cnn1d,cnn2d,lstm) select type of neural network -cnn_proto_opts= # extra options for 'make_cnn_proto.py' -# -hid_layers=4 # nr. 
of hidden layers (prior to sotfmax or bottleneck) -hid_dim=1024 # select hidden dimension -bn_dim= # set a value to get a bottleneck network -dbn= # select DBN to prepend to the MLP initialization -# -init_opts= # options, passed to the initialization script - -# FEATURE PROCESSING -copy_feats=true # resave the train/cv features into /tmp (disabled by default) - copy_feats_tmproot= # tmproot for copy-feats (optional) -# feature config (applies always) -cmvn_opts= -delta_opts= -# feature_transform: -splice=5 # temporal splicing -splice_step=1 # stepsize of the splicing (1 == no gap between frames) -feat_type=plain -# feature config (applies to feat_type traps) -traps_dct_basis=11 # nr. od DCT basis (applies to `traps` feat_type, splice10 ) -# feature config (applies to feat_type transf) (ie. LDA+MLLT, no fMLLR) -transf= -splice_after_transf=5 -# feature config (applies to feat_type lda) -lda_dim=300 # LDA dimension (applies to `lda` feat_type) - -# LABELS -labels= # use these labels to train (override deafault pdf alignments, has to be in 'Posterior' format, see ali-to-post) -num_tgt= # force to use number of outputs in the MLP (default is autodetect) - -# TRAINING SCHEDULER -learn_rate=0.008 # initial learning rate -train_opts= # options, passed to the training script -train_tool= # optionally change the training tool -frame_weights= # per-frame weights for gradient weighting - -# OTHER -seed=777 # seed value used for training data shuffling and initialization + +config= # config, also forwarded to 'train_scheduler.sh', + +# topology, initialization, +network_type=dnn # select type of neural network (dnn,cnn1d,cnn2d,lstm), +hid_layers=4 # nr. of hidden layers (before sotfmax or bottleneck), +hid_dim=1024 # number of neurons per layer, +bn_dim= # (optional) adds bottleneck and one more hidden layer to the NN, +dbn= # (optional) prepend layers to the initialized NN, + +proto_opts= # adds options to 'make_nnet_proto.py', +cnn_proto_opts= # adds options to 'make_cnn_proto.py', + +nnet_init= # (optional) use this pre-initialized NN, +nnet_proto= # (optional) use this NN prototype for initialization, + +# feature processing, +splice=5 # (default) splice features both-ways along time axis, +cmvn_opts= # (optional) adds 'apply-cmvn' to input feature pipeline, see opts, +delta_opts= # (optional) adds 'add-deltas' to input feature pipeline, see opts, +ivector= # (optional) adds 'append-vector-to-feats', the option is rx-filename for the 2nd stream, +ivector_append_tool=append-vector-to-feats # (optional) the tool for appending ivectors, + +feat_type=plain +traps_dct_basis=11 # (feat_type=traps) nr. 
of DCT basis, 11 is good with splice=10, +transf= # (feat_type=transf) import this linear tranform, +splice_after_transf=5 # (feat_type=transf) splice after the linear transform, + +feature_transform_proto= # (optional) use this prototype for 'feature_transform', +feature_transform= # (optional) directly use this 'feature_transform', +pytel_transform= # (BUT) use external python transform, + +# labels, +labels= # (optional) specify non-default training targets, + # (targets need to be in posterior format, see 'ali-to-post', 'feat-to-post'), +num_tgt= # (optional) specifiy number of NN outputs, to be used with 'labels=', + +# training scheduler, +learn_rate=0.008 # initial learning rate, +scheduler_opts= # options, passed to the training scheduler, +train_tool= # optionally change the training tool, +train_tool_opts= # options for the training tool, +frame_weights= # per-frame weights for gradient weighting, +utt_weights= # per-utterance weights (scalar for --frame-weights), + +# data processing, misc. +copy_feats=true # resave the train/cv features into /tmp (disabled by default), +copy_feats_tmproot=/tmp/kaldi.XXXX # sets tmproot for 'copy-feats', +seed=777 # seed value used for data-shuffling, nn-initialization, and training, skip_cuda_check=false + # End configuration. echo "$0 $@" # Print the command line for logging [ -f path.sh ] && . ./path.sh; - - . parse_options.sh || exit 1; +set -euo pipefail if [ $# != 6 ]; then echo "Usage: $0 " echo " e.g.: $0 data/train data/cv data/lang exp/mono_ali_train exp/mono_ali_cv exp/mono_nnet" echo "" echo " Training data : , (for optimizing cross-entropy)" - echo " Held-out data : , (for learn-rate/model selection based on cross-entopy)" + echo " Held-out data : , (for learn-rate scheduling, model selection)" echo " note.: , can point to same directory, or 2 separate directories." echo "" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo "" - echo " --apply-cmvn # apply CMN" - echo " --norm-vars # add CVN if CMN already active" - echo " --splice # concatenate input features" - echo " --feat-type # select type of input features" - echo "" - echo " --mlp-proto # use this NN prototype" + echo " --network-type (dnn,cnn1d,cnn2d,lstm) # type of neural network" + echo " --nnet-proto # use this NN prototype" echo " --feature-transform # re-use this input feature transform" - echo " --hid-layers # number of hidden layers" - echo " --hid-dim # width of hidden layers" - echo " --bn-dim # make bottle-neck network with bn-with N" echo "" + echo " --feat-type (plain|traps|transf) # type of input features" + echo " --cmvn-opts # add 'apply-cmvn' to input feature pipeline" + echo " --delta-opts # add 'add-deltas' to input feature pipeline" + echo " --splice # splice +/-N frames of input features" + echo echo " --learn-rate # initial leaning-rate" - echo " --copy-feats # copy input features to /tmp (it's faster)" + echo " --copy-feats # copy features to /tmp, lowers storage stress" echo "" exit 1; fi @@ -100,7 +99,7 @@ dir=$6 # Using alidir for supervision (default) if [ -z "$labels" ]; then - silphonelist=`cat $lang/phones/silence.csl` || exit 1; + silphonelist=`cat $lang/phones/silence.csl` for f in $alidir/final.mdl $alidir/ali.1.gz $alidir_cv/ali.1.gz; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done @@ -114,14 +113,18 @@ echo echo "# INFO" echo "$0 : Training Neural Network" printf "\t dir : $dir \n" -printf "\t Train-set : $data $alidir \n" -printf "\t CV-set : $data_cv $alidir_cv \n" +printf "\t Train-set : $data $(cat $data/feats.scp | wc -l), $alidir \n" +printf "\t CV-set : $data_cv $(cat $data_cv/feats.scp | wc -l) $alidir_cv \n" echo mkdir -p $dir/{log,nnet} -# skip when already trained -[ -e $dir/final.nnet ] && printf "\nSKIPPING TRAINING... ($0)\nnnet already trained : $dir/final.nnet ($(readlink $dir/final.nnet))\n\n" && exit 0 +# skip when already trained, +if [ -e $dir/final.nnet ]; then + echo "SKIPPING TRAINING... ($0)" + echo "nnet already trained : $dir/final.nnet ($(readlink $dir/final.nnet))" + exit 0 +fi # check if CUDA compiled in and GPU is available, if ! $skip_cuda_check; then cuda-gpu-available || exit 1; fi @@ -135,76 +138,90 @@ if [ ! -z "$labels" ]; then labels_cv="$labels" else echo "Using PDF targets from dirs '$alidir' '$alidir_cv'" - # define pdf-alignment rspecifiers + # training targets in posterior format, labels_tr="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |" labels_cv="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir_cv/ali.*.gz |\" ark:- | ali-to-post ark:- ark:- |" - # - labels_tr_pdf="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" # for analyze-counts. + # training targets for analyze-counts, + labels_tr_pdf="ark:ali-to-pdf $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" labels_tr_phn="ark:ali-to-phones --per-frame=true $alidir/final.mdl \"ark:gunzip -c $alidir/ali.*.gz |\" ark:- |" - # get pdf-counts, used later to post-process DNN posteriors - analyze-counts --verbose=1 --binary=false "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log || exit 1 - # copy the old transition model, will be needed by decoder - copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl || exit 1 + # get pdf-counts, used later for decoding/aligning, + num_pdf=$(hmm-info $alidir/final.mdl | awk '/pdfs/{print $4}') + analyze-counts --verbose=1 --binary=false --counts-dim=$num_pdf \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_pdf" $dir/ali_train_pdf.counts 2>$dir/log/analyze_counts_pdf.log + # copy the old transition model, will be needed by decoder, + copy-transition-model --binary=false $alidir/final.mdl $dir/final.mdl # copy the tree - cp $alidir/tree $dir/tree || exit 1 + cp $alidir/tree $dir/tree - # make phone counts for analysis - [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log || exit 1 + # make phone counts for analysis, + [ -e $lang/phones.txt ] && analyze-counts --verbose=1 --symbol-table=$lang/phones.txt --counts-dim=$num_pdf \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$labels_tr_phn" /dev/null 2>$dir/log/analyze_counts_phones.log fi ###### PREPARE FEATURES ###### echo echo "# PREPARING FEATURES" -# shuffle the list -echo "Preparing train/cv lists :" -cat $data/feats.scp | utils/shuffle_list.pl --srand ${seed:-777} > $dir/train.scp -cp $data_cv/feats.scp $dir/cv.scp -# print the list sizes -wc -l $dir/train.scp $dir/cv.scp - -# re-save the train/cv features to /tmp, reduces LAN traffic, avoids disk-seeks due to 
shuffled features if [ "$copy_feats" == "true" ]; then - tmpdir=$(mktemp -d $copy_feats_tmproot); mv $dir/train.scp{,_non_local}; mv $dir/cv.scp{,_non_local} - copy-feats scp:$dir/train.scp_non_local ark,scp:$tmpdir/train.ark,$dir/train.scp || exit 1 - copy-feats scp:$dir/cv.scp_non_local ark,scp:$tmpdir/cv.ark,$dir/cv.scp || exit 1 - trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; ls $tmpdir; rm -r $tmpdir" EXIT + echo "# re-saving features to local disk," + tmpdir=$(mktemp -d $copy_feats_tmproot) + copy-feats scp:$data/feats.scp ark,scp:$tmpdir/train.ark,$dir/train_sorted.scp + copy-feats scp:$data_cv/feats.scp ark,scp:$tmpdir/cv.ark,$dir/cv.scp + trap "echo '# Removing features tmpdir $tmpdir @ $(hostname)'; ls $tmpdir; rm -r $tmpdir" EXIT +else + # or copy the list, + cp $data/feats.scp $dir/train_sorted.scp + cp $data_cv/feats.scp $dir/cv.scp fi +# shuffle the list, +utils/shuffle_list.pl --srand ${seed:-777} <$dir/train_sorted.scp >$dir/train.scp -#create a 10k utt subset for global cmvn estimates +# create a 10k utt subset for global cmvn estimates, head -n 10000 $dir/train.scp > $dir/train.scp.10k +# for debugging, add lists with non-local features, +utils/shuffle_list.pl --srand ${seed:-777} <$data/feats.scp >$dir/train.scp_non_local +cp $data_cv/feats.scp $dir/cv.scp_non_local -###### PREPARE FEATURE PIPELINE ###### - -# optionally import feature setup from pre-training, +###### OPTIONALLY IMPORT FEATURE SETTINGS (from pre-training) ###### +ivector_dim= # no ivectors, if [ ! -z $feature_transform ]; then D=$(dirname $feature_transform) + echo "# importing feature settings from dir '$D'" [ -e $D/norm_vars ] && cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)" # Bwd-compatibility, [ -e $D/cmvn_opts ] && cmvn_opts=$(cat $D/cmvn_opts) [ -e $D/delta_order ] && delta_opts="--delta-order=$(cat $D/delta_order)" # Bwd-compatibility, [ -e $D/delta_opts ] && delta_opts=$(cat $D/delta_opts) - echo "Imported config : cmvn_opts='$cmvn_opts' delta_opts='$delta_opts'" + [ -e $D/ivector_dim ] && ivector_dim=$(cat $D/ivector_dim) + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + echo "# cmvn_opts='$cmvn_opts' delta_opts='$delta_opts' ivector_dim='$ivector_dim'" fi +###### PREPARE FEATURE PIPELINE ###### # read the features, feats_tr="ark:copy-feats scp:$dir/train.scp ark:- |" feats_cv="ark:copy-feats scp:$dir/cv.scp ark:- |" + # optionally add per-speaker CMVN, if [ ! -z "$cmvn_opts" ]; then - echo "Will use CMVN statistics : $data/cmvn.scp, $data_cv/cmvn.scp" + echo "# + 'apply-cmvn' with '$cmvn_opts' using statistics : $data/cmvn.scp, $data_cv/cmvn.scp" [ ! -r $data/cmvn.scp ] && echo "Missing $data/cmvn.scp" && exit 1; [ ! -r $data_cv/cmvn.scp ] && echo "Missing $data_cv/cmvn.scp" && exit 1; feats_tr="$feats_tr apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" feats_cv="$feats_cv apply-cmvn $cmvn_opts --utt2spk=ark:$data_cv/utt2spk scp:$data_cv/cmvn.scp ark:- ark:- |" else - echo "apply-cmvn is not used" + echo "# 'apply-cmvn' is not used," fi + # optionally add deltas, if [ ! -z "$delta_opts" ]; then feats_tr="$feats_tr add-deltas $delta_opts ark:- ark:- |" feats_cv="$feats_cv add-deltas $delta_opts ark:- ark:- |" - echo "add-deltas with $delta_opts" + echo "# + 'add-deltas' with '$delta_opts'" fi # keep track of the config, @@ -219,33 +236,40 @@ if [ ! 
-z "$pytel_transform" ]; then pytel_transform=$dir/pytel_transform.py feats_tr="$feats_tr /bin/env python $pytel_transform |" feats_cv="$feats_cv /bin/env python $pytel_transform |" + echo "# + 'pytel-transform' from '$pytel_transform'" fi -# get feature dim -echo "Getting feature dim : " -feat_dim=$(feat-to-dim --print-args=false "$feats_tr" -) -echo "Feature dim is : $feat_dim" +# get feature dim, +feat_dim=$(feat-to-dim "$feats_tr" -) +echo "# feature dim : $feat_dim (input of 'feature_transform')" -# Now we will start building complex feature_transform which will -# be forwarded in CUDA to have fast run-time. +# Now we start building 'feature_transform' which goes right in front of a NN. +# The forwarding is computed on a GPU before the frame shuffling is applied. # -# We will use 1GPU for both feature_transform and MLP training in one binary tool. -# This is against the kaldi spirit to have many independent small processing units, -# but it is necessary because of compute exclusive mode, where GPU cannot be shared -# by multiple processes. +# Same GPU is used both for 'feature_transform' and the NN training. +# So it has to be done by a single process (we are using exclusive mode). +# This also reduces the CPU-GPU uploads/downloads to minimum. if [ ! -z "$feature_transform" ]; then - echo "Using pre-computed feature-transform : '$feature_transform'" - tmp=$dir/$(basename $feature_transform) + echo "# importing 'feature_transform' from '$feature_transform'" + tmp=$dir/imported_$(basename $feature_transform) cp $feature_transform $tmp; feature_transform=$tmp else - # Generate the splice transform - echo "Using splice +/- $splice , step $splice_step" - feature_transform=$dir/tr_splice$splice-$splice_step.nnet - utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice --splice-step=$splice_step > $feature_transform + # Make default proto with splice, + if [ ! 
-z $feature_transform_proto ]; then + echo "# importing custom 'feature_transform_proto' from '$feature_transform_proto'" + else + echo "# + default 'feature_transform_proto' with splice +/-$splice frames," + feature_transform_proto=$dir/splice${splice}.proto + echo " $feat_dim $(((2*splice+1)*feat_dim)) -$splice:$splice " >$feature_transform_proto + fi + + # Initialize 'feature-transform' from a prototype, + feature_transform=$dir/tr_$(basename $feature_transform_proto .proto).nnet + nnet-initialize --binary=false $feature_transform_proto $feature_transform # Choose further processing of spliced features - echo "Feature type : $feat_type" + echo "# feature type : $feat_type" case $feat_type in plain) ;; @@ -253,14 +277,14 @@ else #generate hamming+dct transform feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_hamm_dct${traps_dct_basis}.nnet - echo "Preparing Hamming DCT transform into : $feature_transform" + echo "# + Hamming DCT transform (t$((splice*2+1)),dct${traps_dct_basis}) into '$feature_transform'" #prepare matrices with time-transposed hamming and dct utils/nnet/gen_hamm_mat.py --fea-dim=$feat_dim --splice=$splice > $dir/hamm.mat utils/nnet/gen_dct_mat.py --fea-dim=$feat_dim --splice=$splice --dct-basis=$traps_dct_basis > $dir/dct.mat #put everything together compose-transforms --binary=false $dir/dct.mat $dir/hamm.mat - | \ transf-to-nnet - - | \ - nnet-concat --binary=false $feature_transform_old - $feature_transform || exit 1 + nnet-concat --binary=false $feature_transform_old - $feature_transform ;; transf) feature_transform_old=$feature_transform @@ -271,131 +295,153 @@ else nnet-concat --binary=false $feature_transform_old \ "transf-to-nnet $transf - |" \ "utils/nnet/gen_splice.py --fea-dim=$feat_dim --splice=$splice_after_transf |" \ - $feature_transform || exit 1 - ;; - lda) - transf=$dir/lda$lda_dim.mat - #get the LDA statistics - if [ ! 
-r "$dir/lda.acc" ]; then - echo "LDA: Converting alignments to posteriors $dir/lda_post.scp" - ali-to-post "ark:gunzip -c $alidir/ali.*.gz|" ark:- | \ - weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark,scp:$dir/lda_post.ark,$dir/lda_post.scp 2>$dir/log/ali-to-post-lda.log || exit 1; - echo "Accumulating LDA statistics $dir/lda.acc on top of spliced feats" - acc-lda --rand-prune=4.0 $alidir/final.mdl "$feats_tr nnet-forward $feature_transform ark:- ark:- |" scp:$dir/lda_post.scp $dir/lda.acc 2>$dir/log/acc-lda.log || exit 1; - else - echo "LDA: Using pre-computed stats $dir/lda.acc" - fi - #estimate the transform - echo "Estimating LDA transform $dir/lda.mat from the statistics $dir/lda.acc" - est-lda --write-full-matrix=$dir/lda.full.mat --dim=$lda_dim $transf $dir/lda.acc 2>$dir/log/lda.log || exit 1; - #append the LDA matrix to feature_transform - feature_transform_old=$feature_transform - feature_transform=${feature_transform%.nnet}_lda${lda_dim}.nnet - transf-to-nnet $transf - | \ - nnet-concat --binary=false $feature_transform_old - $feature_transform || exit 1 - #remove the temporary file - rm $dir/lda_post.{ark,scp} + $feature_transform ;; *) echo "Unknown feature type $feat_type" exit 1; ;; esac - # keep track of feat_type + + # keep track of feat_type, echo $feat_type > $dir/feat_type - # Renormalize the MLP input to zero mean and unit variance + # Renormalize the MLP input to zero mean and unit variance, feature_transform_old=$feature_transform feature_transform=${feature_transform%.nnet}_cmvn-g.nnet - echo "Renormalizing MLP input features into $feature_transform" - nnet-forward --use-gpu=yes \ - $feature_transform_old "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" \ - ark:- 2>$dir/log/nnet-forward-cmvn.log |\ - compute-cmvn-stats ark:- - | cmvn-to-nnet - - |\ - nnet-concat --binary=false $feature_transform_old - $feature_transform - [ ! -f $feature_transform ] && cat $dir/log/nnet-forward-cmvn.log && echo "Error: Global CMVN failed, was the CUDA GPU okay?" && echo && exit 1 + echo "# compute normalization stats from 10k sentences" + nnet-forward --print-args=true --use-gpu=yes $feature_transform_old \ + "$(echo $feats_tr | sed 's|train.scp|train.scp.10k|')" ark:- |\ + compute-cmvn-stats ark:- $dir/cmvn-g.stats + echo "# + normalization of NN-input at '$feature_transform'" + nnet-concat --binary=false $feature_transform_old \ + "cmvn-to-nnet $dir/cmvn-g.stats -|" $feature_transform fi +if [ ! -z $ivector ]; then + echo + echo "# ADDING IVECTOR FEATURES" + # The iVectors are concatenated 'as they are' directly to the input of the neural network, + # To do this, we paste the features, and use where the 1st component + # contains the transform and 2nd network contains component. + + echo "# getting dims," + dim_raw=$(feat-to-dim "$feats_tr" -) + dim_raw_and_ivec=$(feat-to-dim "$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + echo "# dims, feats-raw $dim_raw, ivectors $dim_ivec," + + # Should we do something with 'feature_transform'? + if [ ! -z $ivector_dim ]; then + # No, the 'ivector_dim' comes from dir with 'feature_transform' with iVec forwarding, + echo "# assuming we got '$feature_transform' with ivector forwarding," + [ $ivector_dim != $dim_ivec ] && \ + echo -n "Error, i-vector dimensionality mismatch!" 
&& \ + echo " (expected $ivector_dim, got $dim_ivec in $ivector)" && exit 1 + else + # Yes, adjust the transform to do ``iVec forwarding'', + feature_transform_old=$feature_transform + feature_transform=${feature_transform%.nnet}_ivec_copy.nnet + echo "# setting up ivector forwarding into '$feature_transform'," + dim_transformed=$(feat-to-dim "$feats_tr nnet-forward $feature_transform_old ark:- ark:- |" -) + nnet-initialize --print-args=false <(echo " $dim_ivec $dim_ivec 1:$dim_ivec ") $dir/tr_ivec_copy.nnet + nnet-initialize --print-args=false <(echo " $((dim_raw+dim_ivec)) $((dim_transformed+dim_ivec)) \ + $feature_transform_old $dir/tr_ivec_copy.nnet ") $feature_transform + fi + echo $dim_ivec >$dir/ivector_dim # mark down the iVec dim! + echo $ivector_append_tool >$dir/ivector_append_tool + + # pasting the iVecs to the feaures, + echo "# + ivector input '$ivector'" + feats_tr="$feats_tr $ivector_append_tool ark:- '$ivector' ark:- |" + feats_cv="$feats_cv $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +###### Show the final 'feature_transform' in the log, +echo +echo "### Showing the final 'feature_transform':" +nnet-info $feature_transform +echo "###" ###### MAKE LINK TO THE FINAL feature_transform, so the other scripts will find it ###### -(cd $dir; [ ! -f final.feature_transform ] && ln -s $(basename $feature_transform) final.feature_transform ) +[ -f $dir/final.feature_transform ] && unlink $dir/final.feature_transform +(cd $dir; ln -s $(basename $feature_transform) final.feature_transform ) +feature_transform=$dir/final.feature_transform ###### INITIALIZE THE NNET ###### echo echo "# NN-INITIALIZATION" -[ ! -z "$nnet_init" ] && echo "Using pre-initialized network '$nnet_init'"; -if [ ! -z "$nnet_proto" ]; then - echo "Initializing using network prototype '$nnet_proto'"; +if [ ! -z $nnet_init ]; then + echo "# using pre-initialized network '$nnet_init'" +elif [ ! -z $nnet_proto ]; then + echo "# initializing NN from prototype '$nnet_proto'"; nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - nnet-initialize $nnet_proto $nnet_init 2>$log || { cat $log; exit 1; } -fi -if [[ -z "$nnet_init" && -z "$nnet_proto" ]]; then - echo "Getting input/output dims :" - #initializing the MLP, get the i/o dims... - #input-dim - num_fea=$(feat-to-dim "$feats_tr nnet-forward $feature_transform ark:- ark:- |" - ) - { #optioanlly take output dim of DBN - [ ! -z $dbn ] && num_fea=$(nnet-forward "nnet-concat $feature_transform $dbn -|" "$feats_tr" ark:- | feat-to-dim ark:- -) - [ -z "$num_fea" ] && echo "Getting nnet input dimension failed!!" && exit 1 - } - - #output-dim - [ -z $num_tgt ] && num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }') - - # make network prototype + nnet-initialize --seed=$seed $nnet_proto $nnet_init +else + echo "# getting input/output dims :" + # input-dim, + get_dim_from=$feature_transform + [ ! 
-z "$dbn" ] && get_dim_from="nnet-concat $feature_transform '$dbn' -|" + num_fea=$(feat-to-dim "$feats_tr nnet-forward \"$get_dim_from\" ark:- ark:- |" -) + + # output-dim, + [ -z $num_tgt ] && \ + num_tgt=$(hmm-info --print-args=false $alidir/final.mdl | grep pdfs | awk '{ print $NF }') + + # make network prototype, nnet_proto=$dir/nnet.proto - echo "Genrating network prototype $nnet_proto" + echo "# genrating network prototype $nnet_proto" case "$network_type" in dnn) utils/nnet/make_nnet_proto.py $proto_opts \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - $num_fea $num_tgt $hid_layers $hid_dim >$nnet_proto || exit 1 + $num_fea $num_tgt $hid_layers $hid_dim >$nnet_proto ;; cnn1d) delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; }) echo "Debug : $delta_opts, delta_order $delta_order" utils/nnet/make_cnn_proto.py $cnn_proto_opts \ --splice=$splice --delta-order=$delta_order --dir=$dir \ - $num_fea >$nnet_proto || exit 1 + $num_fea >$nnet_proto cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }') utils/nnet/make_nnet_proto.py $proto_opts \ --no-proto-head --no-smaller-input-weights \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto || exit 1 + "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto ;; cnn2d) delta_order=$([ -z $delta_opts ] && echo "0" || { echo $delta_opts | tr ' ' '\n' | grep "delta[-_]order" | sed 's:^.*=::'; }) echo "Debug : $delta_opts, delta_order $delta_order" utils/nnet/make_cnn2d_proto.py $cnn_proto_opts \ --splice=$splice --delta-order=$delta_order --dir=$dir \ - $num_fea >$nnet_proto || exit 1 + $num_fea >$nnet_proto cnn_fea=$(cat $nnet_proto | grep -v '^$' | tail -n1 | awk '{ print $5; }') utils/nnet/make_nnet_proto.py $proto_opts \ --no-proto-head --no-smaller-input-weights \ ${bn_dim:+ --bottleneck-dim=$bn_dim} \ - "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto || exit 1 + "$cnn_fea" $num_tgt $hid_layers $hid_dim >>$nnet_proto ;; lstm) utils/nnet/make_lstm_proto.py $proto_opts \ - $num_fea $num_tgt >$nnet_proto || exit 1 + $num_fea $num_tgt >$nnet_proto ;; blstm) utils/nnet/make_blstm_proto.py $proto_opts \ - $num_fea $num_tgt >$nnet_proto || exit 1 + $num_fea $num_tgt >$nnet_proto ;; - *) echo "Unknown : --network_type $network_type" && exit 1; + *) echo "Unknown : --network-type $network_type" && exit 1; esac - # initialize - nnet_init=$dir/nnet.init; log=$dir/log/nnet_initialize.log - echo "Initializing $nnet_proto -> $nnet_init" - nnet-initialize $nnet_proto $nnet_init 2>$log || { cat $log; exit 1; } + # initialize, + nnet_init=$dir/nnet.init + echo "# initializing the NN '$nnet_proto' -> '$nnet_init'" + nnet-initialize --seed=$seed $nnet_proto $nnet_init - # optionally prepend dbn to the initialization - if [ ! -z $dbn ]; then - nnet_init_old=$nnet_init; nnet_init=$dir/nnet_$(basename $dbn)_dnn.init - nnet-concat $dbn $nnet_init_old $nnet_init || exit 1 + # optionally prepend dbn to the initialization, + if [ ! 
-z "$dbn" ]; then + nnet_init_old=$nnet_init; nnet_init=$dir/nnet_dbn_dnn.init + nnet-concat "$dbn" $nnet_init_old $nnet_init fi fi @@ -404,22 +450,17 @@ fi echo echo "# RUNNING THE NN-TRAINING SCHEDULER" steps/nnet/train_scheduler.sh \ + ${scheduler_opts} \ + ${train_tool:+ --train-tool "$train_tool"} \ + ${train_tool_opts:+ --train-tool-opts "$train_tool_opts"} \ ${feature_transform:+ --feature-transform $feature_transform} \ --learn-rate $learn_rate \ - --randomizer-seed $seed \ - ${train_opts} \ - ${train_tool:+ --train-tool "$train_tool"} \ ${frame_weights:+ --frame-weights "$frame_weights"} \ + ${utt_weights:+ --utt-weights "$utt_weights"} \ ${config:+ --config $config} \ - $nnet_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir || exit 1 - -if $prepend_cnn; then - echo "Preparing feature transform with CNN layers for RBM pre-training." - nnet-concat $dir/final.feature_transform "nnet-copy --remove-last-layers=$(((hid_layers+1)*2)) $dir/final.nnet - |" \ - $dir/final.feature_transform_cnn 2>$dir/log/concat_transf_cnn.log || exit 1 -fi + $nnet_init "$feats_tr" "$feats_cv" "$labels_tr" "$labels_cv" $dir -echo "$0 successfuly finished.. $dir" +echo "$0: Successfuly finished. '$dir'" sleep 3 exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_mmi.sh b/egs/wsj/s5/steps/nnet/train_mmi.sh index 6e1b42653c7..e2bbfbc6e92 100755 --- a/egs/wsj/s5/steps/nnet/train_mmi.sh +++ b/egs/wsj/s5/steps/nnet/train_mmi.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MMI/BMMI training of DNN. @@ -21,6 +21,7 @@ learn_rate=0.00001 halving_factor=1.0 #ie. disable halving drop_frames=true verbose=1 +ivector= seed=777 # seed value used for training data shuffling skip_cuda_check=false @@ -31,9 +32,11 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# -ne 6 ]; then - echo "Usage: steps/$0 " - echo " e.g.: steps/$0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_mmi" + echo "Usage: $0 " + echo " e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_mmi" echo "Main options (for others, see top of script file)" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" @@ -54,7 +57,9 @@ alidir=$4 denlatdir=$5 dir=$6 -for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.scp $srcdir/{final.nnet,final.feature_transform}; do +for f in $data/feats.scp $denlatdir/lat.scp \ + $alidir/{tree,final.mdl,ali.1.gz} \ + $srcdir/{final.nnet,final.feature_transform}; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -65,7 +70,7 @@ mkdir -p $dir/log cp $alidir/{final.mdl,tree} $dir -silphonelist=`cat $lang/phones/silence.csl` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` #Get the files we will need @@ -91,7 +96,7 @@ model=$dir/final.mdl # Shuffle the feature list to make the GD stochastic! # By shuffling features, we have to use lattices with random access (indexed by .scp file). -cat $data/feats.scp | utils/shuffle_list.pl --srand $seed > $dir/train.scp +cat $data/feats.scp | utils/shuffle_list.pl --srand $seed >$dir/train.scp ### ### PREPARE FEATURE EXTRACTION PIPELINE @@ -112,15 +117,34 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" [ ! 
-z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# -# Record the setup, +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + dim_raw=$(feat-to-dim "$feats" -) + dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +### Record the setup, [ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts [ ! -z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts -### -### +[ -e $D/pytel_transform.py ] && cp $D/pytel_transform.py $dir/pytel_transform.py +[ -e $D/ivector_dim ] && cp $D/ivector_dim $dir/ivector_dim +[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool ### - ### ### Prepare the alignments ### @@ -173,7 +197,7 @@ while [ $x -le $num_iters ]; do --learn-rate=$learn_rate \ --drop-frames=$drop_frames \ --verbose=$verbose \ - $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1 + $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet fi cur_mdl=$dir/$x.nnet @@ -189,9 +213,15 @@ done echo "MMI/BMMI training finished" -echo "Re-estimating priors by forwarding the training set." -. cmd.sh -nj=$(cat $alidir/num_jobs) -steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj $data $dir || exit 1 +if [ -e $dir/prior_counts ]; then + echo "Priors are already re-estimated, skipping... ($dir/prior_counts)" +else + echo "Re-estimating priors by forwarding 10k utterances from training set." + . cmd.sh + nj=$(cat $alidir/num_jobs) + steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \ + ${ivector:+ --ivector "$ivector"} $data $dir +fi +echo "$0: Done. '$dir'" exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_mpe.sh b/egs/wsj/s5/steps/nnet/train_mpe.sh index 6dd77d59edd..6eb107ef04f 100755 --- a/egs/wsj/s5/steps/nnet/train_mpe.sh +++ b/egs/wsj/s5/steps/nnet/train_mpe.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2013-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2013-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0. # Sequence-discriminative MPE/sMBR training of DNN. @@ -17,12 +17,15 @@ num_iters=4 acwt=0.1 lmwt=1.0 learn_rate=0.00001 +momentum=0.0 halving_factor=1.0 #ie. disable halving do_smbr=true exclude_silphones=true # exclude silphones from approximate accuracy computation unkphonelist= # exclude unkphones from approximate accuracy computation (overrides exclude_silphones) one_silence_class=true # true : reduce insertions in sMBR/MPE FW/BW, more stable training, + # (all silphones are seen as a single class in the sMBR/MPE FW/BW) verbose=1 +ivector= seed=777 # seed value used for training data shuffling skip_cuda_check=false @@ -33,9 +36,11 @@ echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . 
./path.sh; # source the path. . parse_options.sh || exit 1; +set -euo pipefail + if [ $# -ne 6 ]; then - echo "Usage: steps/$0 " - echo " e.g.: steps/$0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_smbr" + echo "Usage: $0 " + echo " e.g.: $0 data/train_all data/lang exp/tri3b_dnn exp/tri3b_dnn_ali exp/tri3b_dnn_denlats exp/tri3b_dnn_smbr" echo "Main options (for others, see top of script file)" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --config # config containing options" @@ -55,7 +60,9 @@ alidir=$4 denlatdir=$5 dir=$6 -for f in $data/feats.scp $alidir/{tree,final.mdl,ali.1.gz} $denlatdir/lat.scp $srcdir/{final.nnet,final.feature_transform}; do +for f in $data/feats.scp $denlatdir/lat.scp \ + $alidir/{tree,final.mdl,ali.1.gz} \ + $srcdir/{final.nnet,final.feature_transform}; do [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done @@ -66,7 +73,7 @@ mkdir -p $dir/log cp $alidir/{final.mdl,tree} $dir -silphonelist=`cat $lang/phones/silence.csl` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` #Get the files we will need nnet=$srcdir/$(readlink $srcdir/final.nnet || echo final.nnet); @@ -87,7 +94,9 @@ cp $feature_transform $dir/final.feature_transform model=$dir/final.mdl [ -z "$model" ] && echo "Error transition model '$model' does not exist!" && exit 1; -#enable/disable silphones from MPE training +# The argument '--silence-phones=csl' together with '--one-silence-class=true' +# will cause regrouping of the silenece phones into a single class in the FW/BW +# which calculates the Loss derivative (the 'new' behavior). mpe_silphones_arg= #empty $exclude_silphones && mpe_silphones_arg="--silence-phones=$silphonelist" # all silphones [ ! -z $unkphonelist ] && mpe_silphones_arg="--silence-phones=$unkphonelist" # unk only @@ -116,15 +125,34 @@ feats="ark,o:copy-feats scp:$dir/train.scp ark:- |" [ ! -z "$cmvn_opts" ] && feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp ark:- ark:- |" # add-deltas (optional), [ ! -z "$delta_opts" ] && feats="$feats add-deltas $delta_opts ark:- ark:- |" -# -# Record the setup, +# add-pytel transform (optional), +[ -e $D/pytel_transform.py ] && feats="$feats /bin/env python $D/pytel_transform.py |" + +# add-ivector (optional), +if [ -e $D/ivector_dim ]; then + [ -z $ivector ] && echo "Missing --ivector, they were used in training!" && exit 1 + # Get the tool, + ivector_append_tool=append-vector-to-feats # default, + [ -e $D/ivector_append_tool ] && ivector_append_tool=$(cat $D/ivector_append_tool) + # Check dims, + dim_raw=$(feat-to-dim "$feats" -) + dim_raw_and_ivec=$(feat-to-dim "$feats $ivector_append_tool ark:- '$ivector' ark:- |" -) + dim_ivec=$((dim_raw_and_ivec - dim_raw)) + [ $dim_ivec != "$(cat $D/ivector_dim)" ] && \ + echo "Error, i-vector dim. mismatch (expected $(cat $D/ivector_dim), got $dim_ivec in '$ivector')" && \ + exit 1 + # Append to feats, + feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |" +fi + +### Record the setup, [ ! -z "$cmvn_opts" ] && echo $cmvn_opts >$dir/cmvn_opts [ ! 
-z "$delta_opts" ] && echo $delta_opts >$dir/delta_opts -### -### +[ -e $D/pytel_transform.py ] && cp {$D,$dir}/pytel_transform.py +[ -e $D/ivector_dim ] && cp {$D,$dir}/ivector_dim +[ -e $D/ivector_append_tool ] && cp $D/ivector_append_tool $dir/ivector_append_tool ### - ### ### Prepare the alignments ### @@ -155,11 +183,12 @@ while [ $x -le $num_iters ]; do --acoustic-scale=$acwt \ --lm-scale=$lmwt \ --learn-rate=$learn_rate \ + --momentum=$momentum \ --do-smbr=$do_smbr \ --verbose=$verbose \ --one-silence-class=$one_silence_class \ $mpe_silphones_arg \ - $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet || exit 1 + $cur_mdl $alidir/final.mdl "$feats" "$lats" "$ali" $dir/$x.nnet fi cur_mdl=$dir/$x.nnet @@ -176,9 +205,15 @@ done echo "MPE/sMBR training finished" -echo "Re-estimating priors by forwarding the training set." -. cmd.sh -nj=$(cat $alidir/num_jobs) -steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj $data $dir || exit 1 +if [ -e $dir/prior_counts ]; then + echo "Priors are already re-estimated, skipping... ($dir/prior_counts)" +else + echo "Re-estimating priors by forwarding 10k utterances from training set." + . cmd.sh + nj=$(cat $alidir/num_jobs) + steps/nnet/make_priors.sh --cmd "$train_cmd" --nj $nj \ + ${ivector:+ --ivector "$ivector"} $data $dir +fi +echo "$0: Done. '$dir'" exit 0 diff --git a/egs/wsj/s5/steps/nnet/train_scheduler.sh b/egs/wsj/s5/steps/nnet/train_scheduler.sh index 4569203e123..59901f5d1d2 100755 --- a/egs/wsj/s5/steps/nnet/train_scheduler.sh +++ b/egs/wsj/s5/steps/nnet/train_scheduler.sh @@ -1,36 +1,35 @@ #!/bin/bash -# Copyright 2012 Karel Vesely (Brno University of Technology) +# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely) # Apache 2.0 -# Train neural network +# Schedules epochs and controls learning rate during the neural network training # Begin configuration. -# training options +# training options, learn_rate=0.008 momentum=0 l1_penalty=0 l2_penalty=0 -# data processing -minibatch_size=256 -randomizer_size=32768 -randomizer_seed=777 + +# data processing, +train_tool="nnet-train-frmshuff" +train_tool_opts="--minibatch-size=256 --randomizer-size=32768 --randomizer-seed=777" feature_transform= -# learn rate scheduling + +# learn rate scheduling, max_iters=20 min_iters=0 # keep training, disable weight rejection, start learn-rate halving as usual, -keep_lr_iters=0 # fix learning rate for N initial epochs, -#start_halving_inc=0.5 -#end_halving_inc=0.1 +keep_lr_iters=0 # fix learning rate for N initial epochs, disable weight rejection, start_halving_impr=0.01 end_halving_impr=0.001 halving_factor=0.5 -# misc. + +# misc, verbose=1 -# tool -train_tool="nnet-train-frmshuff" frame_weights= +utt_weights= # End configuration. @@ -39,6 +38,8 @@ echo "$0 $@" # Print the command line for logging . 
parse_options.sh || exit 1; +set -euo pipefail + if [ $# != 6 ]; then echo "Usage: $0 " echo " e.g.: $0 0.nnet scp:train.scp scp:cv.scp ark:labels_tr.ark ark:labels_cv.ark exp/dnn1" @@ -62,69 +63,71 @@ dir=$6 [ -e $dir/final.nnet ] && echo "'$dir/final.nnet' exists, skipping training" && exit 0 ############################## -#start training +# start training -# choose mlp to start with +# choose mlp to start with, mlp_best=$mlp_init mlp_base=${mlp_init##*/}; mlp_base=${mlp_base%.*} -# optionally resume training from the best epoch + +# optionally resume training from the best epoch, using saved learning-rate, [ -e $dir/.mlp_best ] && mlp_best=$(cat $dir/.mlp_best) [ -e $dir/.learn_rate ] && learn_rate=$(cat $dir/.learn_rate) -# cross-validation on original network +# cross-validation on original network, log=$dir/log/iter00.initial.log; hostname>$log -$train_tool --cross-validate=true \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=false --verbose=$verbose \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - "$feats_cv" "$labels_cv" $mlp_best \ - 2>> $log || exit 1; +$train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_cv" "$labels_cv" $mlp_best \ + 2>> $log loss=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') loss_type=$(cat $dir/log/iter00.initial.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $5; }') echo "CROSSVAL PRERUN AVG.LOSS $(printf "%.4f" $loss) $loss_type" -# resume lr-halving +# resume lr-halving, halving=0 [ -e $dir/.halving ] && halving=$(cat $dir/.halving) -# training + +# training, for iter in $(seq -w $max_iters); do echo -n "ITERATION $iter: " mlp_next=$dir/nnet/${mlp_base}_iter${iter} - # skip iteration if already done + # skip iteration (epoch) if already done, [ -e $dir/.done_iter$iter ] && echo -n "skipping... 
" && ls $mlp_next* && continue - # training + # training, log=$dir/log/iter${iter}.tr.log; hostname>$log - $train_tool \ - --learn-rate=$learn_rate --momentum=$momentum --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=true --verbose=$verbose \ - --binary=true \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - ${randomizer_seed:+ --randomizer-seed=$randomizer_seed} \ - "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ - 2>> $log || exit 1; + $train_tool --cross-validate=false --randomize=true --verbose=$verbose $train_tool_opts \ + --learn-rate=$learn_rate --momentum=$momentum \ + --l1-penalty=$l1_penalty --l2-penalty=$l2_penalty \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_tr" "$labels_tr" $mlp_best $mlp_next \ + 2>> $log || exit 1; tr_loss=$(cat $dir/log/iter${iter}.tr.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "TRAIN AVG.LOSS $(printf "%.4f" $tr_loss), (lrate$(printf "%.6g" $learn_rate)), " - # cross-validation + # cross-validation, log=$dir/log/iter${iter}.cv.log; hostname>$log - $train_tool --cross-validate=true \ - --minibatch-size=$minibatch_size --randomizer-size=$randomizer_size --randomize=false --verbose=$verbose \ - ${feature_transform:+ --feature-transform=$feature_transform} \ - ${frame_weights:+ "--frame-weights=$frame_weights"} \ - "$feats_cv" "$labels_cv" $mlp_next \ - 2>>$log || exit 1; + $train_tool --cross-validate=true --randomize=false --verbose=$verbose $train_tool_opts \ + ${feature_transform:+ --feature-transform=$feature_transform} \ + ${frame_weights:+ "--frame-weights=$frame_weights"} \ + ${utt_weights:+ "--utt-weights=$utt_weights"} \ + "$feats_cv" "$labels_cv" $mlp_next \ + 2>>$log || exit 1; loss_new=$(cat $dir/log/iter${iter}.cv.log | grep "AvgLoss:" | tail -n 1 | awk '{ print $4; }') echo -n "CROSSVAL AVG.LOSS $(printf "%.4f" $loss_new), " - # accept or reject new parameters (based on objective function) + # accept or reject? 
loss_prev=$loss if [ 1 == $(bc <<< "$loss_new < $loss") -o $iter -le $keep_lr_iters -o $iter -le $min_iters ]; then + # accepting: the loss was better, or we had fixed learn-rate, or we had fixed epoch-number, loss=$loss_new mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new) [ $iter -le $min_iters ] && mlp_best=${mlp_best}_min-iters-$min_iters @@ -133,18 +136,19 @@ for iter in $(seq -w $max_iters); do echo "nnet accepted ($(basename $mlp_best))" echo $mlp_best > $dir/.mlp_best else + # rejecting, mlp_reject=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)_rejected mv $mlp_next $mlp_reject echo "nnet rejected ($(basename $mlp_reject))" fi - # create .done file as a mark that iteration is over + # create .done file, the iteration (epoch) is completed, touch $dir/.done_iter$iter - # no learn-rate halving yet, if keep_lr_iters set accordingly + # continue with original learn-rate, [ $iter -le $keep_lr_iters ] && continue - # stopping criterion + # stopping criterion, rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev") if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_impr") ]; then if [ $iter -le $min_iters ]; then @@ -155,30 +159,27 @@ for iter in $(seq -w $max_iters); do break fi - # start annealing when improvement is low + # start learning-rate fade-out when improvement is low, if [ 1 == $(bc <<< "$rel_impr < $start_halving_impr") ]; then halving=1 echo $halving >$dir/.halving fi - # do annealing + # reduce the learning-rate, if [ 1 == $halving ]; then learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}") echo $learn_rate >$dir/.learn_rate fi done -# select the best network +# select the best network, if [ $mlp_best != $mlp_init ]; then mlp_final=${mlp_best}_final_ ( cd $dir/nnet; ln -s $(basename $mlp_best) $(basename $mlp_final); ) ( cd $dir; ln -s nnet/$(basename $mlp_final) final.nnet; ) - echo "Succeeded training the Neural Network : $dir/final.nnet" + echo "$0: Succeeded training the Neural Network : '$dir/final.nnet'" else - "Error training neural network..." + echo "$0: Error training neural network..." exit 1 fi - - - diff --git a/egs/wsj/s5/steps/nnet2/adjust_priors.sh b/egs/wsj/s5/steps/nnet2/adjust_priors.sh new file mode 100755 index 00000000000..3cdcfb4ae73 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/adjust_priors.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright (c) 2015, Johns Hopkins University (Yenda Trmal ) +# License: Apache 2.0 + +# Begin configuration section. +cmd=run.pl +iter=final +# End configuration section + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/tri4_mpe_degs exp/tri4_mpe" + echo "" + echo "Performs priors adjustment either on the final iteration" + echo "or iteration of choice of the training. The adjusted model" + echo "filename will be suffixed by \"adj\", i.e. for the final" + echo "iteration final.mdl will become final.adj.mdl" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --iter # which iteration to be adjusted" + exit 1; +fi + +degs_dir=$1 +dir=$2 + +src_model=$dir/${iter}.mdl + +if [ ! 
-f $src_model ]; then + echo "$0: Expecting $src_model to exist." + exit 1 +fi + +if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist." + exit 1 +fi + +num_archives_priors=`cat $degs_dir/info/num_archives_priors` || { + echo "Could not find $degs_dir/info/num_archives_priors."; + exit 1; +} + +$cmd JOB=1:$num_archives_priors $dir/log/get_post.${iter}.JOB.log \ + nnet-compute-from-egs "nnet-to-raw-nnet $src_model -|" \ + ark:$degs_dir/priors_egs.JOB.ark ark:- \| \ + matrix-sum-rows ark:- ark:- \| \ + vector-sum ark:- $dir/post.${iter}.JOB.vec || { + echo "Error in getting posteriors for adjusting priors." + echo "See $dir/log/get_post.${iter}.*.log"; + exit 1; + } + + +$cmd $dir/log/sum_post.${iter}.log \ + vector-sum $dir/post.${iter}.*.vec $dir/post.${iter}.vec || { + echo "Error in summing posteriors. See $dir/log/sum_post.${iter}.log"; + exit 1; + } + +rm -f $dir/post.${iter}.*.vec + +echo "Re-adjusting priors based on computed posteriors for iter $iter" +$cmd $dir/log/adjust_priors.${iter}.log \ + nnet-adjust-priors $src_model $dir/post.${iter}.vec $dir/${iter}.adj.mdl || { + echo "Error in adjusting priors. See $dir/log/adjust_priors.${iter}.log"; + exit 1; + } + +echo "Done adjusting priors (on $src_model)" diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh index 753411f4563..7f1c8c2673e 100755 --- a/egs/wsj/s5/steps/nnet2/decode.sh +++ b/egs/wsj/s5/steps/nnet2/decode.sh @@ -68,7 +68,7 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -99,7 +99,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && \ @@ -142,7 +142,7 @@ if [ $stage -le 1 ]; then $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; fi -# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# The output of this script is the files "lat.*.gz"-- we'll rescore this at # different acoustic scales to get the final output. @@ -151,7 +151,8 @@ if [ $stage -le 2 ]; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh b/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh index 785a0bf8139..0746a3188a1 100755 --- a/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh +++ b/egs/wsj/s5/steps/nnet2/dump_bottleneck_features.sh @@ -36,18 +36,23 @@ nnetdir=$3 archivedir=$4 dir=$5 -# because we [cat trans.*], no need to keep nj consistent with [# of trans] -nj=`cat $transform_dir/num_jobs` || exit 1; - -# Assume that final.mat and final.nnet are at nnetdir -nnet_lda=$nnetdir/final.mat +# Assume that final.nnet is in nnetdir bnf_nnet=$nnetdir/final.raw -for file in $nnet_lda $bnf_nnet; do - if [ ! -f $file ] ; then - echo "No such file $file"; - exit 1; - fi -done +if [ ! -f $bnf_nnet ] ; then + echo "No such file $bnf_nnet"; + exit 1; +fi + +## Set up input features of nnet +if [ -z "$feat_type" ]; then + if [ -f $nnetdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +fi +echo "$0: feature type is $feat_type" + +if [ "$feat_type" == "lda" ] && [ ! -f $nnetdir/final.mat ]; then + echo "$0: no such file $nnetdir/final.mat" + exit 1 +fi name=`basename $data` sdata=$data/split$nj @@ -55,19 +60,13 @@ sdata=$data/split$nj mkdir -p $dir/log mkdir -p $bnf_data echo $nj > $nnetdir/num_jobs -nnet_plice_opts=`cat $nnetdir/nnet_splice_opts 2>/dev/null` splice_opts=`cat $nnetdir/splice_opts 2>/dev/null` +delta_opts=`cat $nnetdir/delta_opts 2>/dev/null` [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; -## Set up input features of nnet -if [ -z "$feat_type" ]; then - if [ -f $nnetdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi -fi -echo "$0: feature type is $feat_type" - case $feat_type in raw) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; - delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $nnetdir/final.mat ark:- ark:- |" ;; *) echo "Invalid feature type $feat_type" && exit 1; @@ -76,10 +75,16 @@ esac if [ ! -z "$transform_dir" ]; then echo "Using transforms from $transform_dir" [ ! 
-f $transform_dir/trans.1 ] && echo "No such file $transform_dir/trans.1" && exit 1; -# cat $transform_dir/trans.* > $nnetdir/trans || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + transform_nj=`cat $transform_dir/num_jobs` || exit 1; + if [ "$nj" != "$transform_nj" ]; then + for n in $(seq $transform_nj); do cat $transform_dir/trans.$n; done >$dir/trans.ark + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.ark ark:- ark:- |" + else + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$transform_dir/trans.JOB ark:- ark:- |" + fi fi + if [ $stage -le 1 ]; then echo "Making BNF scp and ark." $cmd JOB=1:$nj $dir/log/make_bnf_$name.JOB.log \ @@ -87,23 +92,23 @@ if [ $stage -le 1 ]; then copy-feats --compress=true ark:- ark,scp:$archivedir/raw_bnfeat_$name.JOB.ark,$archivedir/raw_bnfeat_$name.JOB.scp || exit 1; fi -N0=$(cat $data/feats.scp | wc -l) +rm $dir/trans.ark 2>/dev/null + +N0=$(cat $data/feats.scp | wc -l) N1=$(cat $archivedir/raw_bnfeat_$name.*.scp | wc -l) if [[ "$N0" != "$N1" ]]; then echo "Error happens when generating BNF for $name (Original:$N0 BNF:$N1)" exit 1; fi -echo -n >$bnf_data/feats.scp # Concatenate feats.scp into bnf_data -for n in `seq 1 $nj`; do - cat $archivedir/raw_bnfeat_$name.$n.scp >> $bnf_data/feats.scp -done +for n in $(seq $nj); do cat $archivedir/raw_bnfeat_$name.$n.scp; done > $bnf_data/feats.scp for f in segments spk2utt text utt2spk wav.scp char.stm glm kws reco2file_and_channel stm; do [ -e $data/$f ] && cp -r $data/$f $bnf_data/$f done +echo "$0: computing CMVN stats." steps/compute_cmvn_stats.sh $bnf_data $dir $archivedir echo "$0: done making BNF feats.scp." diff --git a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh index c932e0463cc..4c08a08b824 100755 --- a/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh +++ b/egs/wsj/s5/steps/nnet2/get_egs_discriminative2.sh @@ -81,29 +81,30 @@ nj=$(cat $denlatdir/num_jobs) || exit 1; # $nj is the number of # splits of the denlats and alignments. +[ "$(readlink /bin/sh)" == dash ] && \ + echo "This script won't work if /bin/sh points to dash. make it point to bash." && exit 1 + nj_ali=$(cat $alidir/num_jobs) || exit 1; sdata=$data/split$nj utils/split_data.sh $data $nj - - - if [ $nj_ali -eq $nj ]; then ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz |" - prior_ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.JOB.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/priors_uttlist | ali-to-pdf $alidir/final.mdl ark,t:- ark:- |" + all_ids=$(seq -s, $nj) + prior_ali_rspecifier="ark,s,cs:gunzip -c $alidir/ali.{$all_ids}.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/priors_uttlist | ali-to-pdf $alidir/final.mdl ark,t:- ark:- |" else ali_rspecifier="scp:$dir/ali.scp" prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" if [ $stage -le 1 ]; then echo "$0: number of jobs in den-lats versus alignments differ: dumping them as single archive and index." 
all_ids=$(seq -s, $nj_ali) - copy-int-vector --print-args=false \ - "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; fi fi - splice_opts=`cat $alidir/splice_opts 2>/dev/null` silphonelist=`cat $lang/phones/silence.csl` || exit 1; cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` @@ -124,7 +125,7 @@ else echo 0 > $dir/info/ivector_dim fi -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/priors_uttlist || exit 1; @@ -137,13 +138,13 @@ echo "$0: feature type is $feat_type" case $feat_type in raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" - priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` - cp $alidir/final.mat $dir + cp $alidir/final.mat $dir feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" - priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" ;; *) echo "$0: invalid feature type $feat_type" && exit 1; esac @@ -159,7 +160,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $alidir/final.mat; then @@ -173,19 +174,20 @@ if [ ! -z "$transform_dir" ]; then if [ $nj -ne $nj_orig ]; then # Copy the transforms into an archive with an index. for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" - priors_feats="$priors_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/$trans.scp ark:- ark:- |" else # number of jobs matches with alignment dir. 
feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" - priors_feats="$priors_feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + all_ids=`seq -s, $nj` + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/$trans.{$all_ids} |' ark:- ark:- |" fi fi if [ ! -z $online_ivector_dir ]; then # add iVectors to the features. feats="$feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" - priors_feats="$priors_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" + priors_feats="$priors_feats paste-feats --length-tolerance=$ivector_period ark:- 'ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |' ark:- |" fi @@ -193,7 +195,7 @@ if [ $stage -le 2 ]; then echo "$0: working out number of frames of training data" num_frames=$(steps/nnet2/get_num_frames.sh $data) - echo $num_frames > $dir/info/num_frames + echo $num_frames > $dir/info/num_frames # Working out total number of archives. Add one on the assumption the # num-frames won't divide exactly, and we want to round up. @@ -210,7 +212,7 @@ if [ $stage -le 2 ]; then echo $num_archives >$dir/info/num_archives || exit 1 echo $num_archives_temp >$dir/info/num_archives_temp || exit 1 - + frames_per_archive=$[$num_frames/$num_archives] # note, this is the number of frames per archive prior to discarding frames. @@ -256,38 +258,22 @@ if [ $stage -le 10 ]; then priors_egs_list= for y in `seq $num_archives_priors`; do utils/create_data_link.pl $dir/priors_egs.$y.ark - for x in `seq $nj`; do - utils/create_data_link.pl $dir/priors_egs_orig.$x.$y.ark - done - priors_egs_list="$priors_egs_list ark:$dir/priors_egs_orig.JOB.$y.ark" + priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" done - + nnet_context_opts="--left-context=$left_context --right-context=$right_context" echo "$0: dumping egs for prior adjustment in the background." -$cmd JOB=1:$nj $dir/log/create_priors_subset.JOB.log \ +$cmd $dir/log/create_priors_subset.log \ nnet-get-egs $ivectors_opt $nnet_context_opts "$priors_feats" \ "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ ark:- \| nnet-copy-egs ark:- $priors_egs_list || \ - { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.*.log"; exit 1; } + { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; } sleep 3; -echo "$0: recombining archives on disk" -# combine all the "priors_egs_orig.JOB.*.scp" (over the $nj splits of the data) and -# writing to the priors_egs.JOB.ark - -priors_egs_list= -for n in $(seq $nj); do - priors_egs_list="$priors_egs_list $dir/priors_egs_orig.$n.JOB.ark" -done - -echo $num_archives_priors >$dir/info/num_archives_priors - -$cmd JOB=1:$num_archives_priors $dir/log/copy_priors_egs.JOB.log \ - nnet-copy-egs "ark:cat $priors_egs_list|" ark:$dir/priors_egs.JOB.ark || \ - { touch $dir/.error; echo "Error in creating priors_egs. 
See $dir/log/copy_priors_egs.*.log"; exit 1; } +echo $num_archives_priors >$dir/info/num_archives_priors fi @@ -306,12 +292,12 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - + degs_list=$(for n in $(seq $nj); do echo $dir/degs_orig.$n.JOB.ark; done) if [ $num_archives -eq $num_archives_temp ]; then echo "$0: combining data into final archives and shuffling it" - + $cmd JOB=1:$num_archives $dir/log/shuffle.JOB.log \ cat $degs_list \| nnet-shuffle-egs-discriminative --srand=JOB ark:- \ ark:$dir/degs.JOB.ark || exit 1; @@ -354,10 +340,6 @@ if $cleanup; then file=$dir/degs_orig.$x.$y.ark [ -L $file ] && rm $(readlink -f $file); rm $file done - for y in $(seq $num_archives_priors); do - file=$dir/priors_egs_orig.$x.$y.ark - [ -L $file ] && rm $(readlink -f $file); rm $file - done done if [ $num_archives_temp -ne $num_archives ]; then for z in $(seq $num_archives); do diff --git a/egs/wsj/s5/steps/nnet2/get_lda_block.sh b/egs/wsj/s5/steps/nnet2/get_lda_block.sh index c840e014250..7bd4ecf5647 100755 --- a/egs/wsj/s5/steps/nnet2/get_lda_block.sh +++ b/egs/wsj/s5/steps/nnet2/get_lda_block.sh @@ -104,7 +104,7 @@ while [ $[$cur_index+$block_size] -le $feat_dim ]; do echo >> $dir/indexes num_blocks=$[$num_blocks+1] cur_index=$[$cur_index+$block_shift] - if [ $[$cur_index+$block_size-1] -gt $feat_dim ]; then + if [ $[$cur_index+$block_size] -gt $feat_dim ]; then cur_index=$[$feat_dim-$block_size]; fi done diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh deleted file mode 100755 index a960e2fcfe9..00000000000 --- a/egs/wsj/s5/steps/nnet2/get_num_frames.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# This script works out the approximate number of frames in a training directory -# this is sometimes needed by higher-level scripts - -num_samples=1000 - - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -ne 1 ]; then - ( - echo "Usage: $0 " - echo "Prints the number of frames of data in the data-dir, via sampling rather" - echo "than trying to access all the data." - ) 1>&2 -fi - -data=$1 - -if [ ! -f $data/feats.scp ]; then - if [ -f $data/segments ]; then - echo "$0: $data/feats.scp does not exist, but $data/segments does exist; using that and assuming 100 frames per second." 1>&2 - num_frames=$(cat $data/segments | awk '{x += $4 - $3;} END{print int(x*100);}') || exit 1; - echo $num_frames - exit 0; - else - echo "$0: neither $data/feats.scp nor $data/segments exist." 
1>&2 - exit 1; - fi -fi - - -sample_frames=$(utils/shuffle_list.pl $data/feats.scp | head -n $num_samples | sort | feat-to-len --print-args=false scp:-) - -num_files_orig=$(wc -l <$data/feats.scp) -if [ $num_samples -lt $num_files_orig ]; then - num_files_sampled=$num_samples -else - num_files_sampled=$num_files_orig -fi - -perl -e "\$n = int(($sample_frames * 1.0 * $num_files_orig) / (1.0 * $num_files_sampled)); print \"\$n\n\";"; diff --git a/egs/wsj/s5/steps/nnet2/get_num_frames.sh b/egs/wsj/s5/steps/nnet2/get_num_frames.sh new file mode 120000 index 00000000000..d5eab6ede07 --- /dev/null +++ b/egs/wsj/s5/steps/nnet2/get_num_frames.sh @@ -0,0 +1 @@ +../../utils/data/get_num_frames.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet2/remove_egs.sh b/egs/wsj/s5/steps/nnet2/remove_egs.sh index da0484e954a..143a5d0d86a 100755 --- a/egs/wsj/s5/steps/nnet2/remove_egs.sh +++ b/egs/wsj/s5/steps/nnet2/remove_egs.sh @@ -34,10 +34,8 @@ if [ -f $egs/.nodelete ]; then fi -flist=$egs/egs.*.ark - -for f in $egs/egs.*.ark $egs/degs.*.ark; do +for f in $egs/egs.*.ark $egs/degs.*.ark $egs/cegs.*.ark; do if [ -L $f ]; then rm $(dirname $f)/$(readlink $f) # this will print a warning if it fails. fi diff --git a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh index d3f5223b59d..9e018015075 100755 --- a/egs/wsj/s5/steps/nnet2/retrain_simple2.sh +++ b/egs/wsj/s5/steps/nnet2/retrain_simple2.sh @@ -355,7 +355,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x $dir/$x.mdl \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh index 1c34749ba7f..a99075f2aef 100755 --- a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh +++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh @@ -4,14 +4,16 @@ # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar +# 2015 Xingyu Na # Apache 2.0. # train_convnet_accel2.sh is modified from train_pnorm_accel2.sh. It propotypes -# the training of a ConvNet. The ConvNet is composed of 4 layers. The first layer +# the training of a ConvNet. The ConvNet is composed of 4 hidden layers. The first layer # is a Convolutional1d component plus a Maxpooling component. The second layer # is a single Convolutional1d component. The third and fourth layers are affine # components with ReLU nonlinearities. Due to non-squashing output, normalize -# component is applied to all four layers. +# component is applied to all four layers. The number of hidden layers is hard +# coded now. # train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2" # suffix is because they both use the the "new" egs format, created by @@ -63,7 +65,7 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of # (the point of this is to get data in different minibatches on different iterations, # since in the preconditioning method, 2 samples in the same minibatch can # affect each others' gradients. - +num_hidden_layers=4 add_layers_period=2 # by default, add new layers every 2 iterations. 
stage=-3 @@ -84,6 +86,7 @@ patch_dim1=7 # dim of convolutional kernel in the first layer pool_size=3 # size of pooling after the first convolutional layer num_filters2=256 # number of filters in the second convolutional layer patch_dim2=4 # dim of convolutional kernel in the second layer +patch_step2=1 # patch step of the second convolutional layer mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) @@ -262,10 +265,8 @@ if [ $stage -le -2 ]; then tot_input_dim=$[$feat_dim*$tot_splice] num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1] num_pool=$[$num_patch1/$pool_size] - patch_dim2=$[$patch_dim2*$num_filters1] - patch_step2=$num_filters1 - patch_stride2=$[$num_pool*$num_filters1] # same as pool outputs - num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2)/$patch_step2] + patch_stride2=$num_pool + num_patch2=$[1+($patch_stride2-$patch_dim2)/$patch_step2] conv_out_dim1=$[$num_filters1*$num_patch1] # 128 x (36 - 7 + 1) pool_out_dim=$[$num_filters1*$num_pool] conv_out_dim2=$[$num_filters2*$num_patch2] @@ -284,7 +285,7 @@ SoftmaxComponent dim=$num_leaves EOF cat >$dir/replace.1.config < #tree leaves, if # specified.) num_threads=16 -parallel_opts="--num-threads 16 --mem 1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -87,12 +90,12 @@ lda_opts= lda_dim= egs_opts= transform_dir= # If supplied, overrides alidir -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -143,15 +146,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." 
- + exit 1; fi @@ -258,7 +261,7 @@ if [ $stage -le -2 ]; then online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") - + # create the config files for nnet initialization python steps/nnet2/make_multisplice_configs.py \ --splice-indexes "$splice_indexes" \ @@ -279,7 +282,7 @@ if [ $stage -le -2 ]; then nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \ $dir/0.mdl || exit 1; fi - +if [ $pnorm_input_dim -eq $pnorm_output_dim ] && [ $fix_nnet ]; then fix_nnet=true;fi if [ $stage -le -1 ]; then echo "Training transition probabilities and setting priors" $cmd $dir/log/train_trans.log \ @@ -290,16 +293,16 @@ if [ $stage -le -1 ]; then echo "prepare initial vector for FixedScaleComponent before softmax" echo "use priors^$presoftmax_prior_scale_power and rescale to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true --binary=false $alidir/final.mdl ark:- $dir/JOB.pacc || exit 1; cat $dir/*.pacc > $dir/pacc rm $dir/*.pacc awk -v power=$presoftmax_prior_scale_power \ - '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} } + '{ for(i=2; i<=NF-1; i++) {sum[i]+=$i} } END { - for (i=2; i<=NF-1; i++) {total+=sum[i]} + for (i=2; i<=NF-1; i++) {total+=sum[i]} ave_pdf=int(total/(NF-2)); total+=0.01*ave_pdf*(NF-2) for (i=2; i<=NF-1; i++) {rescale+=((sum[i]+0.01*ave_pdf)/total)^power} rescale/=(NF-2) @@ -310,7 +313,7 @@ if [ $stage -le -1 ]; then echo "insert an additional layer of FixedScaleComponent before softmax" inp=`nnet-am-info $dir/0.mdl | grep 'Softmax' | awk '{print $2}'` nnet-init $dir/per_element.config - | nnet-insert --insert-at=$inp --randomize-next-component=false $dir/0.mdl - $dir/0.mdl - fi + fi fi # set num_iters so that as close as possible, we process the data $num_epochs @@ -393,13 +396,13 @@ cur_egs_dir=$egs_dir while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; - + if [ $x -gt $[$num_iters/2] ]; then fix_nnet=false; fi this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -444,7 +447,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -496,7 +499,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. 
- + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -512,7 +515,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait @@ -535,11 +538,15 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; fi + if $fix_nnet; then + # do nnet-am-fix to fix some pathology in the network + nnet-am-fix --max-average-deriv=$max_average --min-average-deriv=$min_average $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log || exit; + fi if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then echo "Warning: the mix up opertion is disabled!" @@ -569,7 +576,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh b/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh index 02f02804153..a5cef8aea44 100755 --- a/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh +++ b/egs/wsj/s5/steps/nnet2/train_multisplice_ensemble.sh @@ -490,7 +490,7 @@ while [ $x -lt $num_iters ]; do nnet-train-ensemble \ --minibatch-size=$this_minibatch_size --srand=$x \ --beta=$beta $nnets_ensemble_in \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ ark:- $nnets_ensemble_out || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh index 01dbe9b5dbf..4176d347ccd 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). 
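The embedded perl snippet in the training-loop hunk above selects, among the parallel jobs of one iteration, the model whose log reports the best log-prob-per-frame; the new nnet-am-fix step then repairs pathological activations in the chosen model. A rough Python equivalent of the log scan, assuming the same "log-prob-per-frame=" log format (illustrative, not the code the script actually runs):

import re

def best_job(num_jobs_nnet, log_pattern):
    # log_pattern is a printf-style pattern such as "exp/nnet/log/train.3.%d.log"
    # (hypothetical path); returns the 1-based job index whose last reported
    # log-prob-per-frame is highest.
    best_n, best_logprob = 1, -1.0e+10
    for n in range(1, num_jobs_nnet + 1):
        logprob = None
        with open(log_pattern % n) as f:
            for line in f:
                m = re.search(r'log-prob-per-frame=(\S+)', line)
                if m:
                    logprob = float(m.group(1))   # keep the last occurrence
        if logprob is not None and logprob > best_logprob:
            best_logprob, best_n = logprob, n
    return best_n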
# 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -27,12 +27,12 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 p=2 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -77,7 +77,7 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="--num-threads 16 --mem 1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 @@ -90,12 +90,12 @@ egs_opts= io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. transform_dir= # If supplied, overrides alidir postdir= -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -139,15 +139,15 @@ if [ $# != 4 ]; then echo " --splice-width # Number of frames on each side to append for feature input" echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -205,7 +205,7 @@ ivector_dim=$(cat $dir/ivector_dim) || exit 1; lda_dim=$(cat $dir/lda_dim) || exit 1; if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then - echo "$0: calling get_egs2.sh" + echo "$0: calling get_egs2.sh" steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \ --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \ --cmd "$cmd" $egs_opts $data $alidir $dir/egs || exit 1; @@ -253,7 +253,7 @@ SoftmaxComponent dim=$num_leaves EOF # to hidden.config it will write the part of the config corresponding to a - # single hidden layer; we need this to add new layers. + # single hidden layer; we need this to add new layers. 
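As the --realign-times help text above describes, each supplied fraction is multiplied by the number of training iterations to pick the iteration at which realignment happens. A one-line sketch of that mapping (Python, illustrative; the exact rounding used by the real script is an assumption here):

def realign_iters(realign_times, num_iters):
    return [int(float(t) * num_iters) for t in realign_times.split()]

print(realign_iters("0.25 0.5", 200))   # [50, 100]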
cat >$dir/hidden.config <) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -521,7 +521,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh index 2708eb85636..3e6c0c2ed96 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_multisplice2.sh @@ -429,7 +429,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh b/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh index 44639ebd2d9..fe0f4cf7a37 100755 --- a/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh +++ b/egs/wsj/s5/steps/nnet2/train_pnorm_simple2.sh @@ -465,7 +465,7 @@ while [ $x -lt $num_iters ]; do $cmd $parallel_opts $dir/log/train.$x.$n.log \ nnet-train$parallel_suffix $parallel_train_opts \ --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ + "ark,bg:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ $dir/$[$x+1].$n.mdl || touch $dir/.error & done wait diff --git a/egs/wsj/s5/steps/nnet3/adjust_priors.sh b/egs/wsj/s5/steps/nnet3/adjust_priors.sh new file mode 100755 index 00000000000..60d377f18e8 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/adjust_priors.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +. path.sh + +# This script computes the DNN output averaged over a small subset of +# training egs and stores it in post.$iter.vec. +# This is used for the purpose of adjusting the nnet priors. +# When --use-raw-nnet is false, then the computed priors is added into the +# nnet model; hence the term adjust priors. +# When --use-raw-nnet is true, the computed priors is not added into the +# nnet model and left in the file post.$iter.vec. + +cmd=run.pl +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +use_gpu=false # if true, we run on GPU. +egs_type=egs # Compute from $egs_type.*.ark in $egs_dir + # If --egs-type is degs, then the program + # nnet3-discriminative-compute-from-egs is used + # instead of nnet3-compute-from-egs. 
+use_raw_nnet=false # If raw nnet, the averaged posterior is computed + # and stored in post.$iter.vec; but there is no + # adjusting of priors +iter=final + +. utils/parse_options.sh + +echo "$0 $@" # Print the command line for logging + +if [ $# -ne 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/nnet3_sad_snr/tdnn_train_100k_whole_1k_splice2_2_relu500" + exit 1 +fi + +dir=$1 +egs_dir=$2 + +if $use_gpu; then + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" +else + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + +for f in $egs_dir/$egs_type.1.ark $egs_dir/info/num_archives; do + if [ ! -f $f ]; then + echo "$f not found" + exit 1 + fi +done + +if $use_raw_nnet; then + model=$dir/$iter.raw +else + model="nnet3-am-copy --raw=true $dir/$iter.mdl - |" +fi + +rm -f $dir/post.$iter.*.vec 2>/dev/null + +left_context=`cat $egs_dir/info/left_context` || exit 1 +right_context=`cat $egs_dir/info/right_context` || exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; +else egs_part=JOB; fi + +if [ $egs_type != "degs" ]; then + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; +else + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$iter.JOB.log \ + nnet3-discriminative-copy-egs ark:$egs_dir/$egs_type.$egs_part.ark ark:- \| \ + nnet3-discriminative-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-discriminative-merge-egs ark:- ark:- \| \ + nnet3-compute-from-degs $prior_gpu_opt --apply-exp=true \ + "$model" ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$iter.JOB.vec || exit 1; + +fi + +sleep 3; # make sure there is time for $dir/post.$iter.*.vec to appear. + +$cmd $dir/log/vector_sum.$iter.log \ + vector-sum $dir/post.$iter.*.vec $dir/post.$iter.vec || exit 1; + +if ! $use_raw_nnet; then + run.pl $dir/log/adjust_priors.$iter.log \ + nnet3-am-adjust-priors $dir/$iter.mdl $dir/post.$iter.vec $dir/$iter.adj.mdl +fi + +rm -f $dir/post.$iter.*.vec; + diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh new file mode 100755 index 00000000000..e151876c690 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# Apache 2.0 + +# Computes training alignments using nnet3 DNN + +# Begin configuration section. +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +beam=10 +retry_beam=40 +transform_dir= +iter=final +use_gpu=true +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +feat_type= # you can set this to force it to use delta features. +# End configuration options. 
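Conceptually, the new adjust_priors.sh script above estimates pdf priors by averaging the network's exponentiated outputs over a sample of training examples: each job writes a summed posterior vector (post.$iter.JOB.vec), the vectors are summed, and nnet3-am-adjust-priors stores the normalized result in the model (at decode time the acoustic scores are, in effect, divided by these priors). A numpy sketch of the averaging step only, assuming the posteriors have already been collected as per-job matrices (function names are illustrative, not the actual binaries):

import numpy as np

def estimate_priors(posteriors_per_job):
    # posteriors_per_job: list of (num_frames_j, num_pdfs) softmax outputs, one per job;
    # summing rows per job and then summing the per-job vectors mirrors
    # matrix-sum-rows | vector-sum followed by the final vector-sum call.
    total = sum(p.sum(axis=0) for p in posteriors_per_job)
    return total / total.sum()   # normalized pdf prior

rng = np.random.default_rng(0)
fake_posteriors = [rng.dirichlet(np.ones(10), size=100) for _ in range(3)]
print(estimate_priors(fake_posteriors).round(3))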
+ +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [--transform-dir ] " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split$nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +if $use_gpu; then + queue_opt="--gpu 1" + gpu_opt="--use-gpu=yes" +else + queue_opt="" + gpu_opt="--use-gpu=no" +fi + +extra_files= +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + + +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) + splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + cp $srcdir/splice_opts $dir 2>/dev/null + cp $srcdir/final.mat $dir || exit 1; + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + # note: subsample-feats, with negative n, will repeat each feature -n times. 
+ ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + +$cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \ + compile-train-graphs $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ + nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + $gpu_opt --beam=$beam --retry-beam=$retry_beam \ + $srcdir/${iter}.mdl ark:- "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; + +echo "$0: done aligning data." + diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh new file mode 100755 index 00000000000..a2cb9927393 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -0,0 +1,179 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + +# This script builds a tree for use in the 'chain' systems (although the script +# itself is pretty generic and doesn't use any 'chain' binaries). This is just +# like the first stages of a standard system, like 'train_sat.sh', except it +# does 'convert-ali' to convert alignments to a monophone topology just created +# from the 'lang' directory (in case the topology is different from where you +# got the system's alignments from), and it stops after the tree-building and +# model-initialization stage, without re-estimating the Gaussians or training +# the transitions. + + +# Begin configuration section. +stage=-5 +exit_stage=-100 # you can use this to require it to exit at the + # beginning of a specific stage. Not all values are + # supported. +cmd=run.pl +context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. +cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves +frame_subsampling_factor=1 +leftmost_questions_truncate=10 +tree_stats_opts= +cluster_phones_opts= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: steps/train_sat.sh <#leaves> " + echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b" + echo "Main options (for others, see top of script file)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + exit 1; +fi + +numleaves=$1 +data=$2 +lang=$3 +alidir=$4 +dir=$5 + +for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do + [ ! 
-f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +oov=`cat $lang/oov.int` +nj=`cat $alidir/num_jobs` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` +ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; +sdata=$data/split$nj; +splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. +cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` + +mkdir -p $dir/log +cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. +cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null # delta option. + +echo $nj >$dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +# Set up features. + +if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +## Set up speaker-independent features. +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" + cp $alidir/final.mat $dir + cp $alidir/full.mat $dir 2>/dev/null + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +# Add fMLLR transforms if available +if [ -f $alidir/trans.1 ]; then + echo "$0: Using transforms from $alidir" + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" +fi + +# Do subsampling of feats, if needed +if [ $frame_subsampling_factor -gt 1 ]; then + feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" +fi + +if [ $stage -le -5 ]; then + echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" + + [ ! -f $lang/phones/sets.int ] && exit 1; + shared_phones_opt="--shared-phones=$lang/phones/sets.int" + # get feature dimension + example_feats="`echo $feats | sed s/JOB/1/g`"; + if ! feat_dim=$(feat-to-dim "$example_feats" - 2>/dev/null) || [ -z $feat_dim ]; then + feat-to-dim "$example_feats" - # to see the error message. + echo "error getting feature dimension" + exit 1; + fi + $cmd JOB=1 $dir/log/init_mono.log \ + gmm-init-mono $shared_phones_opt "--train-feats=$feats subset-feats --n=10 ark:- ark:-|" $lang/topo $feat_dim \ + $dir/mono.mdl $dir/mono.tree || exit 1; +fi + + +if [ $stage -le -4 ]; then + # Get tree stats. + echo "$0: Accumulating tree stats" + $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ + convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ + $alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ + "$feats" ark:- $dir/JOB.treeacc || exit 1; + [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; + $cmd $dir/log/sum_tree_acc.log \ + sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1; + rm $dir/*.treeacc +fi + +if [ $stage -le -3 ] && $train_tree; then + echo "$0: Getting questions for tree clustering." + # preparing questions, roots file... 
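Earlier in this tree-building script the features are passed through subsample-feats --n=$frame_subsampling_factor, and the alignments are converted with the same factor, so that tree statistics are accumulated at the reduced 'chain' frame rate. With a positive n the effect is simply to keep every n-th frame; a tiny numpy sketch (illustrative, assuming frames 0, n, 2n, ... are the ones kept):

import numpy as np

frame_subsampling_factor = 3
feats = np.random.default_rng(0).normal(size=(300, 40))       # 300 frames, 40-dim features
subsampled = feats[::frame_subsampling_factor]                # keep every 3rd frame
print(feats.shape, subsampled.shape)                          # (300, 40) (100, 40)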
+ $cmd $dir/log/questions.log \ + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc \ + $lang/phones/sets.int $dir/questions.int || exit 1; + cat $lang/phones/extra_questions.int >> $dir/questions.int + $cmd $dir/log/compile_questions.log \ + compile-questions --leftmost-questions-truncate=$leftmost_questions_truncate \ + $context_opts $lang/topo $dir/questions.int $dir/questions.qst || exit 1; + + # questions_truncated.int will be needed later on when we build the phone + # language model for 'chain' training. It's a mechanism of keeping the graph + # small. + if [ $leftmost_questions_truncate -gt 0 ]; then + head -n $leftmost_questions_truncate $dir/questions.int > $dir/questions_truncated.int + else + cp $dir/questions.int $dir/questions_truncated.int + fi + + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; +fi + +if [ $stage -le -2 ]; then + echo "$0: Initializing the model" + gmm-init-model --write-occs=$dir/1.occs \ + $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1; + grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning."; + rm $dir/treeacc +fi + +if [ $stage -le -1 ]; then + # Convert the1alignments to the new tree. Note: we likely will not use these + # converted alignments in the CTC system directly, but they could be useful + # for other purposes. + echo "$0: Converting alignments from $alidir to use current tree" + $cmd JOB=1:$nj $dir/log/convert.JOB.log \ + convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ + $alidir/final.mdl $dir/1.mdl $dir/tree \ + "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +cp $dir/1.mdl $dir/final.mdl + +echo $0: Done building tree + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl new file mode 100755 index 00000000000..32dfa272a97 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.pl @@ -0,0 +1,42 @@ +#!/usr/bin/env perl + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +if (@ARGV != 2) { + print STDERR "Usage: utils/gen_topo.pl \n"; + print STDERR "e.g.: utils/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n"; + exit (1); +} + +($nonsil_phones, $sil_phones) = @ARGV; + +$nonsil_phones =~ s/:/ /g; +$sil_phones =~ s/:/ /g; +$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; +$sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; + +print "\n"; +print "\n"; +print "\n"; +print "$nonsil_phones $sil_phones\n"; +print "\n"; +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. 
+# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print " 0 0 1 0.5 2 0.5 \n"; +print " 1 1 1 0.5 2 0.5 \n"; +print " 2 \n"; +print "\n"; +print "\n"; diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py new file mode 100755 index 00000000000..fdd7a02fd88 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. +print(" 0 0 1 0.5 2 0.5 ") +print(" 1 1 1 0.5 2 0.5 ") +print(" 2 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py new file mode 100755 index 00000000000..a33dab666e6 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo2.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. 
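Note that the print statements in gen_topo.pl and gen_topo.py above have lost their angle-bracket markup in this patch text (the topology is written in Kaldi's <Topology> format). Based on the surrounding comments, the intended output is a two-emitting-state entry in which state 0 occurs exactly once, state 1 carries the self-loop, and state 2 is the final state; a hedged reconstruction of roughly what the script prints (exact spacing and tag layout may differ from the original):

def print_chain_topo(phones):
    print("<Topology>")
    print("<TopologyEntry>")
    print("<ForPhones>")
    print(" ".join(str(p) for p in phones))
    print("</ForPhones>")
    # state 0: no self-loop, happens exactly once
    print("<State> 0 <PdfClass> 0 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
    # state 1: optional self-looping state
    print("<State> 1 <PdfClass> 1 <Transition> 1 0.5 <Transition> 2 0.5 </State>")
    # state 2: final, non-emitting
    print("<State> 2 </State>")
    print("</TopologyEntry>")
    print("</Topology>")

print_chain_topo([1, 2, 3])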
+ +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") + +# the pdf-classes are as follows: +# pdf-class 0 is in a 1-frame sequence, the initial and final state. +# pdf-class 1 is in a sequence with >=3 frames, the 'middle' states. (important that +# it be numbered 1, which is the default list of pdf-classes used in 'cluster-phones'). +# pdf-class 2 is the initial-state in a sequence with >= 2 frames. +# pdf-class 3 is the final-state in a sequence with >= 2 frames. +# state 0 is nonemitting in this topology. + +print(" 0 1 0.5 2 0.5 ") # initial nonemitting state. +print(" 1 0 5 1.0 ") # 1-frame sequence. +print(" 2 2 3 0.5 4 0.5 ") # 2 or more frames +print(" 3 1 3 0.5 4 0.5 ") # 3 or more frames +print(" 4 3 5 1.0 ") # 2 or more frames. +print(" 5 ") # final nonemitting state + +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py new file mode 100755 index 00000000000..f43f5046813 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo3.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +print(" 0 0 0 0.5 1 0.5 ") +print(" 1 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py new file mode 100755 index 00000000000..6d88a6e4449 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo4.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# state 0 is obligatory (occurs once) +print(" 0 0 1 0.3333 2 0.3333 3 0.3333 ") +# state 1 is used only when >2 frames +print(" 1 1 1 0.5 2 0.5 ") +# state 2 is used only when >=2 frames (and occurs once) +print(" 2 2 3 1.0 ") +print(" 3 ") # final nonemitting state +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py new file mode 100755 index 00000000000..1583966b58c --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo5.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. 
+ +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +print("") +print("") +print("") +print(" ".join([str(x) for x in all_phones])) +print("") +# state 0 is nonemitting +print(" 0 1 0.5 2 0.5 ") +# state 1 is for when we traverse it in 1 state +print(" 1 0 4 1.0 ") +# state 2 is for when we traverse it in >1 state, for the first state. +print(" 2 2 3 1.0 ") +# state 3 is for the self-loop. Use pdf-class 1 here so that the default +# phone-class clustering (which uses only pdf-class 1 by default) gets only +# stats from longer phones. +print(" 3 1 3 0.5 4 0.5 ") +print(" 4 ") +print("") +print("") + diff --git a/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py b/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py new file mode 100755 index 00000000000..d62cd4aaee4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/gen_topo6.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl' that generates a different type of topology, one that we +# believe should be useful in the 'chain' model. Note: right now it doesn't +# have any real options, and it treats silence and nonsilence the same. The +# intention is that you write different versions of this script, or add options, +# if you experiment with it. + +from __future__ import print_function +import argparse + + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] + +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones])) +print("") +# The next two lines may look like a bug, but they are as intended. State 0 has +# no self-loop, it happens exactly once. And it can go either to state 1 (with +# a self-loop) or to state 2, so we can have zero or more instances of state 1 +# following state 0. +# We make the transition-probs 0.5 so they normalize, to keep the code happy. +# In fact, we always set the transition probability scale to 0.0 in the 'chain' +# code, so they are never used. 
+print(" 0 0 1 0.5 2 0.5 ") +print(" 1 1 1 0.5 2 0.5 ") +print(" 2 2 3 0.5 4 0.5 ") +print(" 3 3 3 0.5 4 0.5 ") +print(" 4 4 5 0.5 6 0.5 ") +print(" 5 5 5 0.5 6 0.5 ") +print(" 6 ") +print("") + +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +print(" 0 0 0 0.25 1 0.25 2 0.25 3 0.25 ") +print(" 1 1 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 2 2 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 3 3 1 0.25 2 0.25 3 0.25 4 0.25 ") +print(" 4 4 4 0.75 5 0.25 ") +print(" 5 ") +print("") +print("") diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh new file mode 100755 index 00000000000..d3112752856 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the 'chain' system +# (and also the validation examples used for diagnostics), and puts them in +# separate archives. +# +# This script dumps egs with many frames of labels, controlled by the +# frames_per_eg config variable (default: 25), plus left and right context. +# Because CTC training involves alignment of data, we can't meaningfully train +# frame by frame. The supervision approach involves the time alignment, though-- +# it is just applied in a loose way, where each symbol can appear in the +# frame-range that it was in in the alignment, extended by a certain margin. +# + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +frames_per_eg=25 # number of feature frames example (not counting added context). + # more->less disk space and less time preparing egs, but more + # I/O during training. note: the script may reduce this if + # reduce_frames_per_eg is true. +frames_overlap_per_eg=0 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +cut_zero_frames=-1 # if activated, activates new-style derivative weights.. i'll reorganize + # this if it works well. +frame_subsampling_factor=3 # frames-per-second of features we train on divided + # by frames-per-second at output of chain model +alignment_subsampling_factor=3 # frames-per-second of input alignments divided + # by frames-per-second at output of chain model +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_egs_combine=0 # #validation examples for combination weights at the very end. +num_train_egs_combine=1000 # number of train examples for the above. +num_egs_diagnostic=400 # number of frames for "compute_prob" jobs +frames_per_iter=400000 # each iteration of training, see this many frames per + # job, measured at the sampling rate of the features + # used. 
This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +right_tolerance= #CTC right tolerance == max label delay. +left_tolerance= + +transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms + +stage=0 +nj=15 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +max_shuffle_jobs_run=100 # the shuffle jobs now include the nnet3-chain-normalize-egs command, + # which is fairly CPU intensive, so we can run quite a few at once + # without overloading the disks. +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/tri4_nnet exp/tri3_lats exp/tri4_nnet/egs" + echo "" + echo "From , 0.trans_mdl (the transition-model), tree (the tree)" + echo "and normalization.fst (the normalization FST, derived from the denominator FST)" + echo "are read." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frames-per-iter <#samples;400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --feat-type # (raw is the default). The feature type you want" + echo " # to use as input to the neural net." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --frames-per-eg # number of supervised frames per eg on disk" + echo " --frames-overlap-per-eg # number of supervised frames of overlap between egs" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-egs-diagnostic <#frames;4000> # Number of egs used in computing (train,valid) diagnostics" + echo " --num-valid-egs-combine <#frames;10000> # Number of egss used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +# Check some files. +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $chaindir/{0.trans_mdl,tree,normalization.fst} $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + +num_lat_jobs=$(cat $latdir/num_jobs) || exit 1; + +# Get list of validation utterances. 
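The selection just below keeps only utterances long enough to yield at least frames_per_eg frames (duration divided by the frame shift), shuffles them, and takes the first num_utts_subset as the validation list; the training subset is picked the same way after excluding the validation utterances. A minimal sketch of the filter (Python, illustrative; the helper name is not from Kaldi):

import random

def pick_subset(utt2dur, frame_shift=0.01, frames_per_eg=25, num_utts_subset=300, seed=0):
    # utt2dur: dict utt-id -> duration in seconds, as in the utt2dur file
    long_enough = [u for u, dur in utt2dur.items() if dur / frame_shift >= frames_per_eg]
    random.Random(seed).shuffle(long_enough)
    return long_enough[:num_utts_subset]

print(pick_subset({"utt1": 0.2, "utt2": 3.0, "utt3": 1.5}, num_utts_subset=2))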
+ +frame_shift=$(utils/data/get_frame_shift.sh $data) +utils/data/get_utt2dur.sh $data + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + +len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + # because of this stage we can again have utts with lengths less than + # frames_per_eg + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +cat $data/utt2dur | \ + awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; +len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` +if [ $len_uttlist -lt $num_utts_subset ]; then + echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; +fi + +[ -z "$transform_dir" ] && transform_dir=$latdir + +# because we'll need the features with a different number of jobs than $latdir, +# copy to ark,scp. +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $latdir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $latdir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. 
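To recap the archive-sizing arithmetic in this hunk (including the exact-multiple rounding done on the next line): the number of archives comes from the total frame count and frames_per_iter, and if that would exceed the open-file limit, the script instead writes a smaller number of intermediate archives, each later split archives_multiple ways. A sketch with illustrative numbers (Python; the function name is not from Kaldi):

def plan_archives(num_frames, frames_per_iter, frames_per_eg, max_open_filehandles):
    num_archives = num_frames // frames_per_iter + 1            # +1 to round up
    num_archives_intermediate = num_archives
    archives_multiple = 1
    while num_archives_intermediate + 4 > max_open_filehandles:
        archives_multiple += 1
        num_archives_intermediate = num_archives // archives_multiple
    num_archives = archives_multiple * num_archives_intermediate   # exact multiple
    egs_per_archive = num_frames // (frames_per_eg * num_archives)
    return num_archives, num_archives_intermediate, archives_multiple, egs_per_archive

print(plan_archives(num_frames=36_000_000, frames_per_iter=400_000,
                    frames_per_eg=25, max_open_filehandles=64))
# -> (90, 45, 2, 16000)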
+num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" +echo $num_archives + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/cegs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/cegs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$num_lat_jobs $dir/log/lattice_copy.JOB.log \ + lattice-copy "ark:gunzip -c $latdir/lat.JOB.gz|" ark,scp:$dir/lat.JOB.ark,$dir/lat.JOB.scp || exit 1; + + for id in $(seq $num_lat_jobs); do cat $dir/lat.$id.scp; done > $dir/lat.scp +fi + + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress --cut-zero-frames=$cut_zero_frames" + + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +# don't do the overlap thing for the validation data. +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" + +ctc_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + ctc_supervision_all_opts="$ctc_supervision_all_opts --left-tolerance=$left_tolerance" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." 
+ + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/lat.scp >$dir/lat_special.scp + + $cmd $dir/log/create_valid_subset.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ + ark:- ark:- \| \ + nnet3-chain-get-egs $valid_ivector_opt $valid_egs_opts $chaindir/normalization.fst \ + "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:$dir/lat_special.scp ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $train_subset_ivector_opt $valid_egs_opts $chaindir/normalization.fst \ + "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_valid_egs_combine ark:$dir/valid_all.cegs \ + ark:$dir/valid_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/valid_all.cegs \ + ark:$dir/valid_diagnostic.cegs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-chain-subset-egs --n=$num_train_egs_combine ark:$dir/train_subset_all.cegs \ + ark:$dir/train_combine.cegs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ + ark:$dir/train_diagnostic.cegs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs +fi + +if [ $stage -le 4 ]; then + # create cegs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/cegs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to egs_list. Note: we omit the + # 'normalization.fst' argument while creating temporary egs: the phase of egs + # preparation that involves the normalization FST is quite CPU-intensive and + # it's more convenient to do it later, in the 'shuffle' stage. Otherwise to + # make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. 
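The file-count worry in the comment above is easy to quantify: stage 4 leaves behind nj * num_archives_intermediate temporary cegs_orig archives, which is why nj is kept moderate and the CPU-heavy normalization step is deferred to the shuffle stage. A rough illustration with example values:

    nj = 15                          # example number of generation jobs
    num_archives_intermediate = 26   # from the earlier bookkeeping
    print(nj * num_archives_intermediate)   # 390 cegs_orig.<job>.<archive>.ark files
    # with nj = 40 the same data would already produce 1040 temporary files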
+ $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + utils/filter_scp.pl $sdata/JOB/utt2spk $dir/lat.scp \| \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl scp:- ark:- \| \ + chain-get-supervision $ctc_supervision_all_opts \ + $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opt $egs_opts \ + "$feats" ark,s,cs:- ark:- \| \ + nnet3-chain-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/cegs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=JOB ark:- ark:$dir/cegs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/cegs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf cegs.$archive_index.ark $dir/cegs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-chain-normalize-egs $chaindir/normalization.fst "ark:cat $egs_list|" ark:- \| \ + nnet3-chain-shuffle-egs --srand=JOB ark:- ark:- \| \ + nnet3-chain-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + # the next statement removes them if we weren't using the soft links to a + # 'storage' directory. + rm cegs_orig.*.ark 2>/dev/null + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/cegs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + +fi + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py new file mode 100644 index 00000000000..d6819e25060 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/nnet3_chain_lib.py @@ -0,0 +1,245 @@ + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. 
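Before moving on to the Python library, one detail of the shuffle stage above is worth making concrete: when archives_multiple > 1, each intermediate archive is split across archives_multiple final archives through soft links named cegs.x.y.ark, and the real archive index is recovered as (x-1)*archives_multiple + y. With illustrative numbers:

    archives_multiple = 3
    num_archives_intermediate = 1000      # example values only
    for x in (1, 2):                      # intermediate-archive index
        for y in range(1, archives_multiple + 1):
            archive_index = (x - 1) * archives_multiple + y
            print("cegs.%d.%d.ark -> cegs.%d.ark" % (x, y, archive_index))
    # cegs.1.1.ark -> cegs.1.ark ... cegs.2.3.ark -> cegs.6.ark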
+ + +import subprocess +import logging +import math +import re +import time +import imp +import os + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + +def GetNumberOfLeaves(dir): + [stdout, stderr] = train_lib.RunKaldiCommand("am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + parts = stdout.split() + #number of pdfs 7115 + assert(' '.join(parts[0:3]) == "number of pdfs") + num_leaves = int(parts[3]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def CreatePhoneLm(dir, tree_dir, run_opts, lm_opts = None): + train_lib.RunKaldiCommand(""" + {command} {dir}/log/make_phone_lm.log \ + chain-est-phone-lm {lm_opts} \ + "ark:gunzip -c {tree_dir}/ali.*.gz | ali-to-phones {tree_dir}/final.mdl ark:- ark:- |" \ + {dir}/phone_lm.fst + """.format(command = run_opts.command, + dir = dir, + lm_opts = lm_opts if lm_opts is not None else '', + tree_dir = tree_dir)) + +def CreateDenominatorFst(dir, tree_dir, run_opts): + train_lib.RunKaldiCommand(""" + copy-transition-model {tree_dir}/final.mdl {dir}/0.trans_mdl + {command} {dir}/log/make_den_fst.log \ + chain-make-den-fst {dir}/tree {dir}/0.trans_mdl {dir}/phone_lm.fst \ + {dir}/den.fst {dir}/normalization.fst""".format( + tree_dir = tree_dir, dir = dir, command = run_opts.command)) + +def GenerateChainEgs(dir, data, lat_dir, egs_dir, + left_context, right_context, + run_opts, stage = 0, + valid_left_context = None, valid_right_context = None, + left_tolerance = None, right_tolerance = None, + frame_subsampling_factor = 3, + alignment_subsampling_factor = 3, + feat_type = 'raw', online_ivector_dir = None, + frames_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + train_lib.RunKaldiCommand(""" +steps/nnet3/chain/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context '{valid_left_context}' \ + --valid-right-context '{valid_right_context}' \ + --left-tolerance '{left_tolerance}' \ + --right-tolerance '{right_tolerance}' \ + --frame-subsampling-factor {frame_subsampling_factor} \ + --alignment-subsampling-factor {alignment_subsampling_factor} \ + --stage {stage} \ + --frames-per-iter {frames_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {dir} {lat_dir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context if valid_left_context is not None else '', + valid_right_context = valid_right_context if valid_right_context is not None else '', + left_tolerance = left_tolerance if left_tolerance is not None else '', + right_tolerance = right_tolerance if right_tolerance is not None else '', + frame_subsampling_factor = frame_subsampling_factor, + alignment_subsampling_factor = 
alignment_subsampling_factor, + stage = stage, frames_per_iter = frames_per_iter, + frames_per_eg = frames_per_eg, + data = data, lat_dir = lat_dir, dir = dir, egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +# this function is exactly similar to the version in nnet3_train_lib.py +# except it uses egs files in place of cegs files +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + + # Write stats with the same format as stats for LDA. + train_lib.RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/cegs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + train_lib.ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +def PrepareInitialAcousticModel(dir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-1 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # The model-format for a 'chain' acoustic model is just the transition + # model and then the raw nnet, so we can use 'cat' to create this, as + # long as they have the same mode (binary or not binary). + # We ensure that they have the same mode (even if someone changed the + # script to make one or both of them text mode) by copying them both + # before concatenating them. + train_lib.RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {dir}/0.trans_mdl {dir}/0.raw {dir}/0.mdl""".format( + command = run_opts.command, dir = dir)) + +def CombineModels(dir, num_iters, num_iters_combine, num_chunk_per_minibatch, + egs_dir, leaky_hmm_coefficient, l2_regularize, + xent_regularize, run_opts): + # Now do combination. 
In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + train_lib.RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-chain-combine --num-iters=40 \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {dir}/den.fst {raw_models} "ark,bg:nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:{egs_dir}/combine.cegs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/final.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + dir = dir, raw_models = " ".join(raw_model_strings), + num_chunk_per_minibatch = num_chunk_per_minibatch, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'final', egs_dir, l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts, wait = False) + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, l2_regularize, xent_regularize, + leaky_hmm_coefficient, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/valid_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, iter = iter, model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + + train_lib.RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-chain-compute-prob --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + --xent-regularize={xent_reg} \ + "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:{egs_dir}/train_diagnostic.cegs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + l2 = l2_regularize, leaky = leaky_hmm_coefficient, + xent_reg = xent_regularize, + egs_dir = egs_dir), wait = wait) + +def ComputeProgress(dir, iter, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + train_lib.RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-am-info {model} '&&' \ +nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model), wait = wait) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py new file mode 100755 index 00000000000..2c12ee27b45 --- /dev/null +++ 
b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -0,0 +1,706 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + +import os +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +import shutil +import math + +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') +nnet3_log_parse = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting chain model trainer (train.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains RNN and DNN acoustic models using the 'chain' objective function. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="directory with the ivectors extracted in an online fashion.") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 150, + help="Number of output labels in each example. Caution: if you double this you should halve --trainer.samples-per-iter.") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 0, + help="Number of additional frames of input to the left" + " of the input chunk. This extra context will be used" + " in the estimation of RNN state before prediction of" + " the first label. In the case of FF-DNN this extra" + " context will be used to allow for frame-shifts") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="Number of additional frames of input to the right" + " of the input chunk. This extra context will be used" + " in the estimation of bidirectional RNN state before" + " prediction of the first label.") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = train_lib.NullstrToNoneAction, + help="Directory with egs. 
If specified this directory " + "will be used rather than extracting egs") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = -6, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="String to provide options directly to steps/nnet3/get_egs.sh script") + + # chain options + parser.add_argument("--chain.lm-opts", type=str, dest='lm_opts', + default = None, action = train_lib.NullstrToNoneAction, + help="options to be be passed to chain-est-phone-lm") + parser.add_argument("--chain.l2-regularize", type=float, dest='l2_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " l2-norm of the output of the network. It should be" + " used without the log-softmax layer for the outputs." + " As l2-norm of the log-softmax outputs can dominate" + " the objective function.") + parser.add_argument("--chain.xent-regularize", type=float, dest='xent_regularize', + default = 0.0, + help="Weight of regularization function which is the" + " cross-entropy cost the outputs.") + parser.add_argument("--chain.right-tolerance", type=int, dest='right_tolerance', + default = 5, help="") + parser.add_argument("--chain.left-tolerance", type=int, dest='left_tolerance', + default = 5, help="") + parser.add_argument("--chain.leaky-hmm-coefficient", type=float, dest='leaky_hmm_coefficient', + default = 0.00001, help="") + parser.add_argument("--chain.apply-deriv-weights", type=str, dest='apply_deriv_weights', + default=True, action=train_lib.StrToBoolAction, + choices = ["true", "false"], + help="") + parser.add_argument("--chain.truncate-deriv-weights", type=float, dest='truncate_deriv_weights', + default =0, + help="Can be used to set to zero the weights of derivs" + " from frames near the edges. (counts subsampled frames)") + parser.add_argument("--chain.frame-subsampling-factor", type=int, + dest='frame_subsampling_factor', + default = 3, + help="ratio of frames-per-second of features we train" + " on, to chain model's output") + parser.add_argument("--chain.alignment-subsampling-factor", type=int, + dest='alignment_subsampling_factor', + default = 3, + help="ratio of frames-per-second of input alignments to" + " chain model's output") + parser.add_argument("--chain.ngram-order", type=int, dest='ngram_order', + default = 3, help="") + parser.add_argument("--chain.left-deriv-truncate", type=int, + dest='left_deriv_truncate', + default = None, help="") + parser.add_argument("--chain.right-deriv-truncate", type=int, + dest='right_deriv_truncate', + default = None, help="") + + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 10, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final" + " model combination stage. 
These models will themselves" + " be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + " iteration. If 0 or a large value the randomization is" + " complete, but this will consume memory and cause spikes" + " in disk I/O. Smaller is easier on disk and memory but" + " less random. It's not a huge deal though, as samples" + " are anyway randomized right at the start. (the point" + " of this is to get data in different minibatches on" + " different iterations, since in the preconditioning" + " method, 2 samples in the same minibatch can affect" + " each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + " during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per" + " minibatch, measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.frames-per-iter", type=int, dest='frames_per_iter', + default=800000, + help ="Each iteration of training, see this many [input]" + " frames per job. This option is passed to get_egs.sh." + " Aim for about a minute of training time") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="Value used in preconditioning matrix estimation") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="Max number of jobs used for LDA stats accumulation") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0002, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00002, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at" + " the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = "The is the maximum number of models we give to" + " the final 'combine' stage, but these models will" + " themselves be averages of iteration-number ranges.") + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="Momentum used in update computation." 
+ " Note: we implemented it in such a way that it doesn't" + " increase the effective learning rate.") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 1.0, + help="Scaling factor used for scaling the parameter" + " matrices when the derivative averages are below the" + " shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this" + " threshold we scale the parameter matrices with the" + " shrink-value. It is less than 0.25 for sigmoid non-linearities.") + parser.add_argument("--trainer.optimization.shrink-nonlinearity", type=str, dest='shrink_nonlinearity', + default = "SigmoidComponent", choices = ["TanhComponent", "SigmoidComponent"], + help="The non-linear component from which the" + " deriv-avg values are going to used to compute" + " mean-deriv-avg. The mean-deriv-avg is going to be" + " compared with shrink-threshold. Be careful to specify" + " a shrink-threshold which is dependent on the" + " shrink-nonlinearity type") + + # RNN specific trainer options + parser.add_argument("--trainer.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=512, + help="Number of sequences to be processed in parallel every minibatch" ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = train_lib.NullstrToNoneAction, dest="command", + help="Specifies the script to launch jobs." + " e.g. queue.pl for launching on SGE cluster run.pl" + " for launching on local machine", default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = train_lib.StrToBoolAction, + choices = ["true", "false"], + help="If true, remove egs after experiment") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = train_lib.NullstrToNoneAction, + help="Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local expertise to setup. ") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--tree-dir", type=str, required = True, + help="Languade directory") + parser.add_argument("--lat-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + print(sys.argv) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be non-negative") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be non-negative") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.lat_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not train_lib.CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + + run_opts.command = args.command + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.parallel_train_opts = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
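To see why each training job needs its own archive and frame shift, the index arithmetic used a few lines below can be traced with small example numbers (illustrative only; the original file is Python 2, where k/num_archives is integer division, written here with //):

    num_archives = 78                # e.g. 26 archives x frame-subsampling-factor 3
    frame_subsampling_factor = 3
    num_archives_processed = 80      # archives consumed on earlier iterations
    for job in (1, 2, 3):
        k = num_archives_processed + job - 1
        archive_index = (k % num_archives) + 1
        frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor
        print(job, archive_index, frame_shift)
    # job 1 -> archive 3, shift 1; job 2 -> archive 4, shift 2; job 3 -> archive 5, shift 0:
    # parallel jobs read different archives and see the data at different frame offsets.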
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + deriv_time_opts="" + if left_deriv_truncate is not None: + deriv_time_opts += " --optimization.min-deriv-time={0}".format(left_deriv_truncate) + if right_deriv_truncate is not None: + deriv_time_opts += " --optimization.max-deriv-time={0}".format(int(chunk-width-right_deriv_truncate)) + + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame_shift = (archive_index + k/num_archives) % frame_subsampling_factor + # previous : frame_shift = (k/num_archives) % frame_subsampling_factor + if job == 1: + cur_cache_io_opts = cache_io_opts + " --write-cache={dir}/cache.{next_iter}".format(dir = dir, next_iter = iter + 1) + else: + cur_cache_io_opts = cache_io_opts + + process_handle = train_lib.RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-chain-train {parallel_train_opts} \ + --apply-deriv-weights={app_deriv_wts} \ + --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ + {cache_io_opts} --xent-regularize={xent_reg} {deriv_time_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" {dir}/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights={trunc_deriv} --frame-shift={fr_shft} ark:{egs_dir}/cegs.{archive_index}.ark ark:- | nnet3-chain-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-chain-merge-egs --minibatch-size={num_chunk_per_minibatch} ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + deriv_time_opts = deriv_time_opts, + trunc_deriv = truncate_deriv_weights, + app_deriv_wts = apply_deriv_weights, + fr_shft = frame_shift, l2 = l2_regularize, + xent_reg = xent_regularize, leaky = leaky_hmm_coefficient, + parallel_train_opts = run_opts.parallel_train_opts, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + cache_io_opts = cur_cache_io_opts, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + if stderr_value.strip() != '': + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + apply_deriv_weights, left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, max_param_change, shuffle_buffer_size, + frame_subsampling_factor, truncate_deriv_weights, + run_opts): + + # Set off jobs doing some diagnostics, in the background. 
+ # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + chain_lib.ComputeTrainCvProbabilities(dir, iter, egs_dir, + l2_regularize, xent_regularize, leaky_hmm_coefficient, run_opts) + + if iter > 0: + chain_lib.ComputeProgress(dir, iter, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + cache_io_opts = "" + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + cache_io_opts = "--read-cache={dir}/cache.{iter}".format(dir = dir, iter = iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + apply_deriv_weights, + left_deriv_truncate, right_deriv_truncate, + l2_regularize, xent_regularize, leaky_hmm_coefficient, + momentum, cur_max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + frame_subsampling_factor, truncate_deriv_weights, + cache_io_opts, run_opts) + + [models_to_average, best_model] = train_lib.GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
+ train_lib.RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + train_lib.RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + +def CheckForRequiredFiles(feat_dir, tree_dir, lat_dir): + for file in ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), + '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir), + '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), + '{0}/num_jobs'.format(lat_dir), '{0}/splice_opts'.format(lat_dir)]: + if not os.path.isfile(file): + raise Exception('Expected {0} to exist.'.format(file)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Check files + CheckForRequiredFiles(args.feat_dir, args.tree_dir, args.lat_dir) + + # Set some variables. + num_jobs = train_lib.GetNumberOfJobs(args.tree_dir) + feat_dim = train_lib.GetFeatDim(args.feat_dir) + ivector_dim = train_lib.GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + train_lib.SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.tree_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = train_lib.ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
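The negative --stage values handled below follow a fixed layout; the lookup table here is only a reading aid distilled from the code that follows (passing --stage N skips every step whose checkpoint is less than N):

    CHAIN_TRAIN_STAGES = {
        -6: "estimate the phone language model (chain-est-phone-lm)",
        -5: "build the denominator and normalization FSTs",
        -4: "initialize init.raw from configs/init.config",
        -3: "generate egs with steps/nnet3/chain/get_egs.sh (unless --egs.dir is given)",
        -2: "accumulate the LDA-like preconditioning matrix",
        -1: "add the first layer and assemble 0.mdl",
        0: "first iteration of the main training loop",
    }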
+ if (args.stage <= -6): + logger.info("Creating phone language-model") + chain_lib.CreatePhoneLm(args.dir, args.tree_dir, run_opts, lm_opts = args.lm_opts) + + if (args.stage <= -5): + logger.info("Creating denominator FST") + chain_lib.CreateDenominatorFst(args.dir, args.tree_dir, run_opts) + + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + train_lib.RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + # this is where get_egs.sh is called. + chain_lib.GenerateChainEgs(args.dir, args.feat_dir, args.lat_dir, default_egs_dir, + left_context + args.frame_subsampling_factor/2, + right_context + args.frame_subsampling_factor/2, + run_opts, + left_tolerance = args.left_tolerance, + right_tolerance = args.right_tolerance, + frame_subsampling_factor = args.frame_subsampling_factor, + alignment_subsampling_factor = args.alignment_subsampling_factor, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + frames_per_iter = args.frames_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = train_lib.VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + num_archives_expanded = num_archives * args.frame_subsampling_factor + + if (args.num_jobs_final > num_archives_expanded): + raise Exception('num_jobs_final cannot exceed the expanded number of archives') + + # copy the properties of the egs to dir for + # use during decoding + train_lib.CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + chain_lib.ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + chain_lib.PrepareInitialAcousticModel(args.dir, run_opts) + + file_handle = open("{0}/frame_subsampling_factor".format(args.dir),"w") + file_handle.write(str(args.frame_subsampling_factor)) + file_handle.close() + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
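The relation described in the comment above can be sanity-checked with example numbers; num_archives_expanded is num_archives multiplied by the frame-subsampling factor, and the job counts below are the script defaults:

    num_epochs = 10
    num_archives_expanded = 78        # e.g. 26 archives x frame-subsampling-factor 3
    num_jobs_initial, num_jobs_final = 1, 8

    num_archives_to_process = num_epochs * num_archives_expanded              # 780
    num_iters = (num_archives_to_process * 2) // (num_jobs_initial + num_jobs_final)
    print(num_iters)   # 173; 173 iterations x ~4.5 average jobs ~= 778 archive passes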
+ num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = train_lib.VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: train_lib.GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if args.shrink_value != 1.0: + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if train_lib.DoShrinkage(iter, model_file, args.shrink_nonlinearity, args.shrink_threshold) else 1 + else: + shrinkage_value = args.shrink_value + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + args.apply_deriv_weights, args.left_deriv_truncate, args.right_deriv_truncate, + args.l2_regularize, args.xent_regularize, args.leaky_hmm_coefficient, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, + args.frame_subsampling_factor, + args.truncate_deriv_weights, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + train_lib.RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir, key="log-probability") + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + train_lib.SendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + chain_lib.CombineModels(args.dir, num_iters, num_iters_combine, + args.num_chunk_per_minibatch, egs_dir, + args.leaky_hmm_coefficient, args.l2_regularize, + args.xent_regularize, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + train_lib.CleanNnetDir(args.dir, num_iters, egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = 
nnet3_log_parse.GenerateAccuracyReport(args.dir, "log-probability") + if args.email is not None: + train_lib.SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh new file mode 100755 index 00000000000..036da48cdc9 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -0,0 +1,645 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. +# This version of the script, nnet3/chain/train_tdnn.sh, is for 'chain' systems. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=10 # Number of epochs of training; + # the number of iterations is worked out from this. + # Be careful with this: we actually go over the data + # num-epochs * frame-subsampling-factor times, due to + # using different data-shifts. +truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames + # near the edges. (counts subsampled frames). +apply_deriv_weights=true +initial_effective_lrate=0.0002 +final_effective_lrate=0.00002 +extra_left_context=0 # actually for recurrent setups. +pnorm_input_dim=3000 +pnorm_output_dim=300 +relu_dim= # you can use this to make it use ReLU's instead of p-norms. + +jesus_opts= # opts to steps/nnet3/make_jesus_configs.py. + # If nonempty, assumes you want to use the jesus nonlinearity, + # and you should supply various options to that script in + # this string. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +lm_opts= # options to chain-est-phone-lm +l2_regularize=0.0 +leaky_hmm_coefficient=0.00001 +xent_regularize=0.0 +frames_per_iter=800000 # each iteration of training, see this many [input] + # frames per job. This option is passed to get_egs.sh. + # Aim for about a minute of training time +right_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +left_tolerance=5 # tolerance at the same frame-rate as the alignment directory. +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +frame_subsampling_factor=3 # ratio of frames-per-second of features we train + # on, to chain model's output +alignment_subsampling_factor=3 # ratio of frames-per-second of input alignments + # to chain model's output +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +max_param_change=2.0 +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. 
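num_jobs_initial and num_jobs_final above control a linear ramp in the number of parallel training jobs; the Python driver (train.py, earlier in this patch) computes the per-iteration job count with the formula below, shown here with illustrative numbers:

    num_jobs_initial, num_jobs_final = 1, 8
    num_iters = 173
    for it in (0, 43, 86, 129, 172):
        current_num_jobs = int(0.5 + num_jobs_initial
                               + (num_jobs_final - num_jobs_initial) * float(it) / num_iters)
        print(it, current_num_jobs)    # -> 1, 3, 4, 6 and 8 jobs respectively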
+ngram_order=3 + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. +final_layer_normalize_target=1.0 # you can set this to less than one if you + # think the final layer is learning too fast + # compared with the other layers. +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-7 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + + +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" +pool_type='none' +pool_window= +pool_lpfilter_width= + +# Format : layer/....layer/ " +# note: hidden layers which are composed of one or more components, +# so hidden layer indexing is different from component count + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +max_lda_jobs=20 # use no more than 20 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of latdir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type=raw # or set to 'lda' to use LDA features. +frames_per_eg=25 # number of frames of output per chunk. To be passed on to get_egs.sh. +left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. +right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/chain/tri3b_tree exp/tri3_latali exp/chain/tdnn_a" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. 
queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --frames-per-iter <#frames|400000> # Number of frames of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +treedir=$2 +latdir=$3 +dir=$4 + + +# Check some files. +for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ + $latdir/lat.1.gz $latdir/final.mdl $latdir/num_jobs $latdir/splice_opts; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +# Set some variables. +nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $treedir/tree $dir + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $treedir/final.mat ] && echo "$0: With --feat-type lda option, expect $treedir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $treedir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ $stage -le -7 ]; then + echo "$0: creating phone language-model" + + $cmd $dir/log/make_phone_lm.log \ + chain-est-phone-lm $lm_opts \ + "ark:gunzip -c $treedir/ali.*.gz | ali-to-phones $treedir/final.mdl ark:- ark:- |" \ + $dir/phone_lm.fst || exit 1 +fi + +if [ $stage -le -6 ]; then + echo "$0: creating denominator FST" + copy-transition-model $treedir/final.mdl $dir/0.trans_mdl + $cmd $dir/log/make_den_fst.log \ + chain-make-den-fst $dir/tree $dir/0.trans_mdl $dir/phone_lm.fst \ + $dir/den.fst $dir/normalization.fst || exit 1; +fi + +# work out num-leaves +num_leaves=$(am-info $dir/0.trans_mdl | grep -w pdfs | awk '{print $NF}') || exit 1; +[ $num_leaves -gt 0 ] || exit 1; + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! -z "$jesus_opts" ]; then + $cmd $dir/log/make_configs.log \ + python steps/nnet3/make_jesus_configs.py \ + --xent-regularize=$xent_regularize \ + --include-log-softmax=false \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $jesus_opts \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + else + [ $xent_regularize != "0.0" ] && \ + echo "$0: --xent-regularize option not supported by tdnn/make_configs.py." && exit 1; + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + pool_opts= + pool_opts=$pool_opts${pool_type:+" --pool-type $pool_type "} + pool_opts=$pool_opts${pool_window:+" --pool-window $pool_window "} + pool_opts=$pool_opts${pool_lpfilter_width:+" --pool-lpfilter-width $pool_lpfilter_width "} + + python steps/nnet3/tdnn/make_configs.py $pool_opts \ + --include-log-softmax=false \ + --final-layer-normalize-target $final_layer_normalize_target \ + --splice-indexes "$splice_indexes" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --num-targets $num_leaves \ + --use-presoftmax-prior-scale false \ + $dir/configs || exit 1; + fi + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +# the next 2 lines are in case the configs were created by an older +# config-generating script, which writes to left_context and right_context +# instead of model_left_context and model_right_context. +[ -z $model_left_context ] && model_left_context=$left_context +[ -z $model_right_context ] && model_right_context=$right_context + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$latdir + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + # we need a bit of extra left-context and right-context to allow for frame + # shifts (we use shifted version of the data for more variety). + extra_opts+=(--left-context $[$model_left_context+$frame_subsampling_factor/2+$extra_left_context]) + extra_opts+=(--right-context $[$model_right_context+$frame_subsampling_factor/2]) + echo "$0: calling get_egs.sh" + steps/nnet3/chain/get_egs.sh $egs_opts "${extra_opts[@]}" \ + --frames-per-iter $frames_per_iter --stage $get_egs_stage \ + --cmd "$cmd" \ + --right-tolerance "$right_tolerance" \ + --left-tolerance "$left_tolerance" \ + --frames-per-eg $frames_per_eg \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $alignment_subsampling_factor \ + $data $dir $latdir $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs in $egs_dir: $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs in $egs_dir: $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. 
+cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null
+
+# confirm that the egs_dir has the necessary context (especially important if
+# the --egs-dir option was used on the command line).
+egs_left_context=$(cat $egs_dir/info/left_context) || exit -1
+egs_right_context=$(cat $egs_dir/info/right_context) || exit -1
+( [ $egs_left_context -lt $model_left_context ] || \
+ [ $egs_right_context -lt $model_right_context ] ) && \
+ echo "$0: egs in $egs_dir have too little context" && exit -1;
+
+frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; }
+num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; }
+
+num_archives_expanded=$[$num_archives*$frame_subsampling_factor]
+
+[ $num_jobs_initial -gt $num_jobs_final ] && \
+ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1;
+
+[ $num_jobs_final -gt $num_archives_expanded ] && \
+ echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1;
+
+if [ $stage -le -3 ]; then
+ echo "$0: getting preconditioning matrix for input features."
+ num_lda_jobs=$num_archives
+ [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs
+
+ # Write stats with the same format as stats for LDA.
+ $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \
+ nnet3-chain-acc-lda-stats --rand-prune=$rand_prune \
+ $dir/init.raw "ark:$egs_dir/cegs.JOB.ark" $dir/JOB.lda_stats || exit 1;
+
+ all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done)
+ $cmd $dir/log/sum_transform_stats.log \
+ sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1;
+
+ rm $all_lda_accs || exit 1;
+
+ # this computes a fixed affine transform computed in the way we described in
+ # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant
+ # of an LDA transform but without dimensionality reduction.
+ $cmd $dir/log/get_transform.log \
+ nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1;
+
+ ln -sf ../lda.mat $dir/configs/lda.mat
+fi
+
+if [ $stage -le -1 ]; then
+ # Add the first layer; this will add in the lda.mat and
+ # presoftmax_prior_scale.vec.
+
+ echo "$0: creating initial raw model"
+ $cmd $dir/log/add_first_layer.log \
+ nnet3-init --srand=-1 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1;
+
+
+ # The model-format for a 'chain' acoustic model is just the transition
+ # model and then the raw nnet, so we can use 'cat' to create this, as
+ # long as they have the same mode (binary or not binary).
+ # We ensure that they have the same mode (even if someone changed the
+ # script to make one or both of them text mode) by copying them both
+ # before concatenating them.
+
+ echo "$0: creating initial model"
+ $cmd $dir/log/init_model.log \
+ nnet3-am-init $dir/0.trans_mdl $dir/0.raw $dir/0.mdl || exit 1;
+fi
+
+echo $frame_subsampling_factor >$dir/frame_subsampling_factor || exit 1;
+
+# set num_iters so that as close as possible, we process the data $num_epochs
+# times, i.e. $num_iters*$avg_num_jobs == $num_epochs*$num_archives_expanded
+# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2.
+
+num_archives_to_process=$[$num_epochs*$num_archives_expanded]
+num_archives_processed=0
+num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)]
+
+finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period]
+
+! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \
+ && echo "$0: Insufficient epochs" && exit 1
+
+echo "$0: Will train for $num_epochs epochs = $num_iters iterations"
+
+if $use_gpu; then
+ parallel_suffix=""
+ train_queue_opt="--gpu 1"
+ combine_queue_opt="--gpu 1"
+ prior_gpu_opt="--use-gpu=yes"
+ prior_queue_opt="--gpu 1"
+ parallel_train_opts=
+ if ! cuda-compiled; then
+ echo "$0: WARNING: you are running with one thread but you have not compiled"
+ echo " for CUDA. You may be running a setup optimized for GPUs. If you have"
+ echo " GPUs and have nvcc installed, go to src/ and do ./configure; make"
+ exit 1
+ fi
+else
+ echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads."
+ parallel_train_opts="--use-gpu=no"
+ train_queue_opt="--num-threads $num_threads"
+ combine_queue_opt="" # the combine stage will be quite slow if not using
+ # GPU, as we didn't enable that program to use
+ # multiple threads.
+ prior_gpu_opt="--use-gpu=no"
+ prior_queue_opt=""
+fi
+
+
+approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final]
+# First work out how many iterations we want to combine over in the final
+# nnet3-chain-combine invocation. (We may end up subsampling from these if the
+# number exceeds max_models_combine). The number we use is:
+# min(max(max_models_combine, approx_iters_per_epoch_final),
+# 1/2 * iters_after_last_layer_added)
+num_iters_combine=$max_models_combine
+if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then
+ num_iters_combine=$approx_iters_per_epoch_final
+fi
+half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2]
+if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then
+ num_iters_combine=$half_iters_after_add_layers
+fi
+first_model_combine=$[$num_iters-$num_iters_combine+1]
+
+x=0
+
+deriv_time_opts=
+[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate"
+[ ! -z "$right_deriv_truncate" ] && \
+ deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))"
+
+
+while [ $x -lt $num_iters ]; do
+ [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0;
+
+ this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);")
+
+ ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process;
+ this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);");
+
+ echo "On iteration $x, learning rate is $this_learning_rate."
+
+
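The learning rate printed above decays exponentially in the number of archives processed, interpolating from --initial-effective-lrate to --final-effective-lrate, and is multiplied by the current number of parallel jobs (whose models are averaged afterwards). A minimal Python sketch of the same schedule the perl one-liners compute, using made-up values in place of the quantities the script derives from the data:

    import math

    # hypothetical values, for illustration only
    num_jobs_initial, num_jobs_final = 2, 8
    num_iters = 100
    num_archives_to_process = 1200
    initial_effective_lrate, final_effective_lrate = 0.001, 0.0001

    def schedule(x, num_archives_processed):
        this_num_jobs = int(0.5 + num_jobs_initial +
                            (num_jobs_final - num_jobs_initial) * x / float(num_iters))
        if x + 1 >= num_iters:
            effective_lrate = final_effective_lrate
        else:
            effective_lrate = initial_effective_lrate * math.exp(
                num_archives_processed *
                math.log(final_effective_lrate / initial_effective_lrate) /
                num_archives_to_process)
        # the rate actually passed to nnet3-chain-train is scaled by the job count
        return this_num_jobs, effective_lrate * this_num_jobs

+ if [ $x -ge 0 ] && [ $stage -le $x ]; then
+
+ # Set off jobs doing some diagnostics, in the background. 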
+ # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/$x.mdl -|" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & + + if [ $x -gt 0 ]; then + # This doesn't use the egs, it only shows the relative change in model parameters. + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" \ + "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \ + nnet3-am-info $dir/$x.mdl & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_io_opts="" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + mdl="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_io_opts="--read-cache=$dir/cache.$x" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + this_max_param_change=$max_param_change + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + this_max_param_change=$(perl -e "print ($max_param_change/sqrt(2));") + fi + + rm $dir/.error 2>/dev/null + + + ( + trap 'for pid in $(jobs -pr); do kill -TERM $pid; done' INT QUIT TERM + # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in $(seq $this_num_jobs); do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + frame_shift=$[($k/$num_archives)%$frame_subsampling_factor]; + if [ $n -eq 1 ]; then + # opts for computation cache (storing compiled computation). 
+ this_cache_io_opts="$cache_io_opts --write-cache=$dir/cache.$[$x+1]" + else + this_cache_io_opts="$cache_io_opts" + fi + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-chain-train --apply-deriv-weights=$apply_deriv_weights \ + --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + $this_cache_io_opts $parallel_train_opts $deriv_time_opts \ + --max-param-change=$this_max_param_change \ + --print-interval=10 "$mdl" $dir/den.fst \ + "ark,bg:nnet3-chain-copy-egs --truncate-deriv-weights=$truncate_deriv_weights --frame-shift=$frame_shift ark:$egs_dir/cegs.$archive.ark ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-chain-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + ) + # the error message below is not that informative, but $cmd will + # have printed a more specific one. + [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + + models_to_average=$(steps/nnet3/get_successful_models.py --difference-threshold 0.1 $this_num_jobs $dir/log/train.$x.%.log) + nnets_list= + for n in $models_to_average; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + + if $do_average; then + # average the output of the different jobs. + $cmd $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + else + # choose the best from the different jobs. + n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { + $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; + undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + [ -z "$n" ] && echo "Error getting best model" && exit 1; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + + rm $nnets_list + [ ! -f $dir/$[$x+1].mdl ] && exit 1; + if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ + [ $[($x-1)%10] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then + rm $dir/$[$x-1].mdl + fi + fi + rm $dir/cache.$x 2>/dev/null + x=$[$x+1] + num_archives_processed=$[$num_archives_processed+$this_num_jobs] +done + + +if [ $stage -le $num_iters ]; then + echo "Doing final combination to produce final.mdl" + + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + nnets_list=() + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + [ ! -f $dir/$iter.mdl ] && echo "Expected $mdl to exist" && exit 1; + mdl="nnet3-am-copy --raw=true $dir/$iter.mdl - |" + nnets_list[$n]="$mdl"; + done + + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) 
+ + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-chain-combine --num-iters=40 --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 $dir/den.fst "${nnets_list[@]}" "ark,bg:nnet3-chain-merge-egs --minibatch-size=$minibatch_size ark:$egs_dir/combine.cegs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$first_model_combine.mdl $dir/final.mdl" || exit 1; + + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/valid_diagnostic.cegs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-chain-compute-prob --l2-regularize=$l2_regularize --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + "nnet3-am-copy --raw=true $dir/final.mdl - |" $dir/den.fst \ + "ark,bg:nnet3-chain-merge-egs ark:$egs_dir/train_diagnostic.cegs ark:- |" & +fi + +if [ ! -f $dir/final.mdl ]; then + echo "$0: $dir/final.mdl does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. 
+ rm $dir/$x.mdl + fi + done +fi diff --git a/egs/wsj/s5/steps/nnet3/components.py b/egs/wsj/s5/steps/nnet3/components.py new file mode 100644 index 00000000000..e9723c392cc --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/components.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python + +from __future__ import print_function +import os +import argparse +import sys +import warnings +import copy +from operator import itemgetter + +def GetSumDescriptor(inputs): + sum_descriptors = inputs + while len(sum_descriptors) != 1: + cur_sum_descriptors = [] + pair = [] + while len(sum_descriptors) > 0: + value = sum_descriptors.pop() + if value.strip() != '': + pair.append(value) + if len(pair) == 2: + cur_sum_descriptors.append("Sum({0}, {1})".format(pair[0], pair[1])) + pair = [] + if pair: + cur_sum_descriptors.append(pair[0]) + sum_descriptors = cur_sum_descriptors + return sum_descriptors + +# adds the input nodes and returns the descriptor +def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + output_dim = 0 + components.append('input-node name=input dim=' + str(feat_dim)) + list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] + output_dim += len(splice_indexes) * feat_dim + if ivector_dim > 0: + components.append('input-node name=ivector dim=' + str(ivector_dim)) + list.append('ReplaceIndex(ivector, t, 0)') + output_dim += ivector_dim + if len(list) > 1: + splice_descriptor = "Append({0})".format(", ".join(list)) + else: + splice_descriptor = list[0] + print(splice_descriptor) + return {'descriptor': splice_descriptor, + 'dimension': output_dim} + +def AddNoOpLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_noop type=NoOpComponent dim={1}'.format(name, input['dimension'])) + component_nodes.append('component-node name={0}_noop component={0}_noop input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_noop'.format(name), + 'dimension': input['dimension']} + +def AddLdaLayer(config_lines, name, input, lda_file): + return AddFixedAffineLayer(config_lines, name, input, lda_file) + +def AddFixedAffineLayer(config_lines, name, input, matrix_file): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={0}_fixaffine type=FixedAffineComponent matrix={1}'.format(name, matrix_file)) + component_nodes.append('component-node name={0}_fixaffine component={0}_fixaffine input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_fixaffine'.format(name), + 'dimension': input['dimension']} + + +def AddBlockAffineLayer(config_lines, name, input, output_dim, num_blocks): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + assert((input['dimension'] % num_blocks == 0) and + (output_dim % num_blocks == 0)) + components.append('component name={0}_block_affine type=BlockAffineComponent input-dim={1} output-dim={2} num-blocks={3}'.format(name, input['dimension'], output_dim, num_blocks)) + component_nodes.append('component-node name={0}_block_affine component={0}_block_affine input={1}'.format(name, input['descriptor'])) + + return {'descriptor' : '{0}_block_affine'.format(name), + 'dimension' : output_dim} + +def AddPermuteLayer(config_lines, name, input, column_map): + components = 
config_lines['components'] + component_nodes = config_lines['component-nodes'] + permute_indexes = ",".join(map(lambda x: str(x), column_map)) + components.append('component name={0}_permute type=PermuteComponent column-map={1}'.format(name, permute_indexes)) + component_nodes.append('component-node name={0}_permute component={0}_permute input={1}'.format(name, input['descriptor'])) + + return {'descriptor': '{0}_permute'.format(name), + 'dimension': input['dimension']} + +def AddAffineLayer(config_lines, name, input, output_dim, ng_affine_options = ""): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) + + return {'descriptor': '{0}_affine'.format(name), + 'dimension': output_dim} + +def AddAffRelNormLayer(config_lines, name, input, output_dim, ng_affine_options = " bias-stddev=0 ", norm_target_rms = 1.0, self_repair_scale = None): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input['dimension'], output_dim, ng_affine_options)) + components.append("component name={0}_relu type=RectifiedLinearComponent dim={1} {2}".format(name, output_dim, self_repair_string)) + components.append("component name={0}_renorm type=NormalizeComponent dim={1} target-rms={2}".format(name, output_dim, norm_target_rms)) + + component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) + component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) + component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) + + return {'descriptor': '{0}_renorm'.format(name), + 'dimension': output_dim} + +def AddConvolutionLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + num_filters, input_vectorization, + param_stddev = None, bias_stddev = None, + filter_bias_file = None, + is_updatable = True): + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + conv_init_string = ("component name={name}_conv type=ConvolutionComponent " + "input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} " + "filt-x-dim={filt_x_dim} filt-y-dim={filt_y_dim} " + "filt-x-step={filt_x_step} filt-y-step={filt_y_step} " + "input-vectorization-order={vector_order}".format(name = name, + input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim, + filt_x_dim = filt_x_dim, filt_y_dim = filt_y_dim, + filt_x_step = filt_x_step, filt_y_step = filt_y_step, + vector_order = input_vectorization)) + if filter_bias_file is not None: + conv_init_string += " matrix={0}".format(filter_bias_file) + else: + conv_init_string += " num-filters={0}".format(num_filters) + + components.append(conv_init_string) + component_nodes.append("component-node 
name={0}_conv_t component={0}_conv input={1}".format(name, input['descriptor'])) + + num_x_steps = (1 + (input_x_dim - filt_x_dim) / filt_x_step) + num_y_steps = (1 + (input_y_dim - filt_y_dim) / filt_y_step) + output_dim = num_x_steps * num_y_steps * num_filters; + return {'descriptor': '{0}_conv_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_x_steps, num_y_steps, num_filters], + 'vectorization': 'zyx'} + +# The Maxpooling component assumes input vectorizations of type zyx +def AddMaxpoolingLayer(config_lines, name, input, + input_x_dim, input_y_dim, input_z_dim, + pool_x_size, pool_y_size, pool_z_size, + pool_x_step, pool_y_step, pool_z_step): + if input_x_dim < 1 or input_y_dim < 1 or input_z_dim < 1: + raise Exception("non-positive maxpooling input size ({0}, {1}, {2})". + format(input_x_dim, input_y_dim, input_z_dim)) + if pool_x_size > input_x_dim or pool_y_size > input_y_dim or pool_z_size > input_z_dim: + raise Exception("invalid maxpooling pool size vs. input size") + if pool_x_step > pool_x_size or pool_y_step > pool_y_size or pool_z_step > pool_z_size: + raise Exception("invalid maxpooling pool step vs. pool size") + + assert(input['dimension'] == input_x_dim * input_y_dim * input_z_dim) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append('component name={name}_maxp type=MaxpoolingComponent ' + 'input-x-dim={input_x_dim} input-y-dim={input_y_dim} input-z-dim={input_z_dim} ' + 'pool-x-size={pool_x_size} pool-y-size={pool_y_size} pool-z-size={pool_z_size} ' + 'pool-x-step={pool_x_step} pool-y-step={pool_y_step} pool-z-step={pool_z_step} '. + format(name = name, + input_x_dim = input_x_dim, input_y_dim = input_y_dim, input_z_dim = input_z_dim, + pool_x_size = pool_x_size, pool_y_size = pool_y_size, pool_z_size = pool_z_size, + pool_x_step = pool_x_step, pool_y_step = pool_y_step, pool_z_step = pool_z_step)) + + component_nodes.append('component-node name={0}_maxp_t component={0}_maxp input={1}'.format(name, input['descriptor'])) + + num_pools_x = 1 + (input_x_dim - pool_x_size) / pool_x_step; + num_pools_y = 1 + (input_y_dim - pool_y_size) / pool_y_step; + num_pools_z = 1 + (input_z_dim - pool_z_size) / pool_z_step; + output_dim = num_pools_x * num_pools_y * num_pools_z; + + return {'descriptor': '{0}_maxp_t'.format(name), + 'dimension': output_dim, + '3d-dim': [num_pools_x, num_pools_y, num_pools_z], + 'vectorization': 'zyx'} + + +def AddSoftmaxLayer(config_lines, name, input): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + components.append("component name={0}_log_softmax type=LogSoftmaxComponent dim={1}".format(name, input['dimension'])) + component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) + + return {'descriptor': '{0}_log_softmax'.format(name), + 'dimension': input['dimension']} + +def AddPerDimAffineLayer(config_lines, name, input, input_window): + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 
'dimension':len(filter_input_splice_indexes) * input['dimension']}
+
+ # add permute component to shuffle the feature columns of the Append
+ # descriptor output so that columns corresponding to the same feature index
+ # are contiguous. Then add a block-affine component to collapse all the feature
+ # indexes across time steps into a single value.
+ num_feats = input['dimension']
+ num_times = len(filter_input_splice_indexes)
+ column_map = []
+ for i in range(num_feats):
+ for j in range(num_times):
+ column_map.append(j * num_feats + i)
+
+ composite_config_lines = {'components':[], 'component-nodes':[]}
+
+ permuted_output_descriptor = AddPermuteLayer(composite_config_lines,
+ name, filter_input_descriptor, column_map)
+
+ # add a block-affine component
+ output_descriptor = AddBlockAffineLayer(composite_config_lines, name,
+ permuted_output_descriptor,
+ num_feats, num_feats)
+
+
+ # strip names
+ ccl = composite_config_lines['components']
+ composite_config_line = ''
+ for index in range(len(ccl)):
+ parts = ccl[index].split()
+ assert(parts[0] == "component" and parts[1].split('=')[0] == "name")
+ composite_config_line += " component{0}='{1}'".format(index+1, " ".join(parts[2:]))
+
+ components.append("component name={name} type=CompositeComponent num-components={nc} {rest}".format(name = '{0}_PDA'.format(name),
+ nc = len(ccl),
+ rest = composite_config_line))
+ component_nodes.append("component-node name={0}_PDA component={0}_PDA input={1}".format(name, filter_input_descriptor['descriptor']))
+ return [{'descriptor': '{0}_PDA'.format(name),
+ 'dimension': output_descriptor['dimension']
+ }, filter_context, filter_context]
+
+def AddSigmoidLayer(config_lines, name, input, self_repair_scale = None):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+
+ self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else ''
+ components.append("component name={0}_sigmoid type=SigmoidComponent dim={1} {2}".format(name, input['dimension'], self_repair_string))
+ component_nodes.append("component-node name={0}_sigmoid component={0}_sigmoid input={1}".format(name, input['descriptor']))
+ return {'descriptor': '{0}_sigmoid'.format(name),
+ 'dimension': input['dimension']}
+
+def AddOutputLayer(config_lines, input, label_delay = None, suffix=None, objective_type = "linear"):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+ name = 'output'
+ if suffix is not None:
+ name = '{0}-{1}'.format(name, suffix)
+
+ if label_delay is None:
+ component_nodes.append('output-node name={0} input={1} objective={2}'.format(name, input['descriptor'], objective_type))
+ else:
+ component_nodes.append('output-node name={0} input=Offset({1},{2}) objective={3}'.format(name, input['descriptor'], label_delay, objective_type))
+
+def AddFinalLayer(config_lines, input, output_dim,
+ ng_affine_options = " param-stddev=0 bias-stddev=0 ",
+ label_delay=None,
+ use_presoftmax_prior_scale = False,
+ prior_scale_file = None,
+ include_log_softmax = True,
+ add_final_sigmoid = False,
+ name_affix = None,
+ objective_type = "linear"):
+ components = config_lines['components']
+ component_nodes = config_lines['component-nodes']
+
+ if name_affix is not None:
+ final_node_prefix = 'Final-' + str(name_affix)
+ else:
+ final_node_prefix = 'Final'
+
+ prev_layer_output = AddAffineLayer(config_lines,
+ final_node_prefix , input, output_dim,
+ ng_affine_options)
+ if include_log_softmax:
+ if 
use_presoftmax_prior_scale : + components.append('component name={0}-fixed-scale type=FixedScaleComponent scales={1}'.format(final_node_prefix, prior_scale_file)) + component_nodes.append('component-node name={0}-fixed-scale component={0}-fixed-scale input={1}'.format(final_node_prefix, + prev_layer_output['descriptor'])) + prev_layer_output['descriptor'] = "{0}-fixed-scale".format(final_node_prefix) + prev_layer_output = AddSoftmaxLayer(config_lines, final_node_prefix, prev_layer_output) + elif add_final_sigmoid: + # Useful when you need the final outputs to be probabilities + # between 0 and 1. + # Usually used with an objective-type such as "quadratic" + prev_layer_output = AddSigmoidLayer(config_lines, final_node_prefix, prev_layer_output) + # we use the same name_affix as a prefix in for affine/scale nodes but as a + # suffix for output node + AddOutputLayer(config_lines, prev_layer_output, label_delay, suffix = name_affix, objective_type = objective_type) + +def AddLstmLayer(config_lines, + name, input, cell_dim, + recurrent_projection_dim = 0, + non_recurrent_projection_dim = 0, + clipping_threshold = 1.0, + norm_based_clipping = "false", + ng_per_element_scale_options = "", + ng_affine_options = "", + lstm_delay = -1, + self_repair_scale = None): + assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) + components = config_lines['components'] + component_nodes = config_lines['component-nodes'] + + input_descriptor = input['descriptor'] + input_dim = input['dimension'] + name = name.strip() + + if (recurrent_projection_dim == 0): + add_recurrent_projection = False + recurrent_projection_dim = cell_dim + recurrent_connection = "m_t" + else: + add_recurrent_projection = True + recurrent_connection = "r_t" + if (non_recurrent_projection_dim == 0): + add_non_recurrent_projection = False + else: + add_non_recurrent_projection = True + + self_repair_string = "self-repair-scale={0:.10f}".format(self_repair_scale) if self_repair_scale is not None else '' + # Natural gradient per element scale parameters + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + # Parameter Definitions W*(* replaced by - to have valid names) + components.append("# Input gate control : W_i* matrices") + components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Forget gate control : W_f* matrices") + components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Output gate control : W_o* matrices") + components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + components.append("# note : the cell outputs pass through a diagonal matrix") + 
components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim, ng_per_element_scale_options)) + + components.append("# Cell input matrices : W_c* matrices") + components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + recurrent_projection_dim, cell_dim, ng_affine_options)) + + + components.append("# Defining the non-linearities") + components.append("component name={0}_i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_g type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + components.append("component name={0}_h type=TanhComponent dim={1} {2}".format(name, cell_dim, self_repair_string)) + + components.append("# Defining the cell computations") + components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) + components.append("component name={0}_c type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) + + # c1_t and c2_t defined below + component_nodes.append("component-node name={0}_c_t component={0}_c input=Sum({0}_c1_t, {0}_c2_t)".format(name)) + c_tminus1_descriptor = "IfDefined(Offset({0}_c_t, {1}))".format(name, lstm_delay) + + component_nodes.append("# i_t") + component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) + + component_nodes.append("# f_t") + component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) + + component_nodes.append("# o_t") + component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_o2 component={0}_w_oc input={0}_c_t".format(name)) + component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) + + component_nodes.append("# h_t") + component_nodes.append("component-node name={0}_h_t component={0}_h input={0}_c_t".format(name)) + + component_nodes.append("# g_t") + component_nodes.append("component-node 
name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, {3})))".format(name, input_descriptor, recurrent_connection, lstm_delay)) + component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) + + component_nodes.append("# parts of c_t") + component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) + component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) + + component_nodes.append("# m_t") + component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, {0}_h_t)".format(name)) + + # add the recurrent connections + if (add_recurrent_projection and add_non_recurrent_projection): + components.append("# projection matrices : Wrm and Wpm") + components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("# r_t and p_t") + component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) + component_nodes.append("dim-range-node name={0}_r_t_preclip input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) + output_descriptor = '{0}_rp_t'.format(name) + output_dim = recurrent_projection_dim + non_recurrent_projection_dim + + elif add_recurrent_projection: + components.append("# projection matrices : Wrm") + components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, recurrent_projection_dim, ng_affine_options)) + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, recurrent_projection_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("# r_t") + component_nodes.append("component-node name={0}_r_t_preclip component={0}_Wrm input={0}_m_t".format(name)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_r_t_preclip".format(name)) + output_descriptor = '{0}_r_t'.format(name) + output_dim = recurrent_projection_dim + + else: + components.append("component name={0}_r type=ClipGradientComponent dim={1} clipping-threshold={2} norm-based-clipping={3} ".format(name, cell_dim, clipping_threshold, norm_based_clipping)) + component_nodes.append("component-node name={0}_r_t component={0}_r input={0}_m_t".format(name)) + output_descriptor = '{0}_r_t'.format(name) + output_dim = cell_dim + + return { + 'descriptor': output_descriptor, + 'dimension':output_dim + } diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 880ddd11f48..151dd6be2e7 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -12,8 +12,11 @@ stage=1 transform_dir= # dir to find fMLLR transforms. nj=4 # number of decoding jobs. If --transform-dir set, must match that number! acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. 
+post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. cmd=run.pl beam=15.0 +frames_per_chunk=50 max_active=7000 min_active=200 ivector_scale=1.0 @@ -23,6 +26,10 @@ num_threads=1 # if >1, will use gmm-latgen-faster-parallel parallel_opts= # ignored now. scoring_opts= skip_scoring=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 feat_type= online_ivector_dir= minimize=false @@ -69,7 +76,7 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -95,7 +102,7 @@ if [ ! -z "$transform_dir" ]; then [ ! -s $transform_dir/num_jobs ] && \ echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) - + if [ $feat_type == "raw" ]; then trans=raw_trans; else trans=trans; fi if [ $feat_type == "lda" ] && \ @@ -125,19 +132,36 @@ fi if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" fi if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ - nnet3-latgen-faster$thread_string $ivector_opts \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ --word-symbol-table=$graphdir/words.txt "$model" \ - $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; + $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; fi -# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# The output of this script is the files "lat.*.gz"-- we'll rescore this at # different acoustic scales to get the final output. @@ -146,7 +170,8 @@ if [ $stage -le 2 ]; then [ ! -x local/score.sh ] && \ echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; echo "score best paths" - local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir echo "score confidence and timing with sclite" fi fi diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 88cf54e824e..2290c4d2e7f 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -34,6 +34,11 @@ 'shape':'box', 'style':'filled' }, + 'ConvolutionComponent':{ + 'color':'lightpink', + 'shape':'box', + 'style':'filled' + }, 'FixedScaleComponent':{ 'color':'blueviolet', 'shape':'box', @@ -64,6 +69,11 @@ 'shape':'rectangle', 'style':'filled' }, + 'ClipGradientComponent':{ + 'color':'bisque', + 'shape':'rectangle', + 'style':'filled' + }, 'ElementwiseProductComponent':{ 'color':'green', 'shape':'rectangle', @@ -84,10 +94,10 @@ def GetDotNodeName(name_string, is_component = False): # 2. Nnet3 names can be shared among components and component nodes # dot does not allow common names # - name_string = re.sub("-", "hyphen", name_string) + node_name_string = re.sub("-", "hyphen", name_string) if is_component: - name_string += name_string.strip() + "_component" - return name_string + node_name_string += node_name_string.strip() + "_component" + return {"label":name_string, "node":node_name_string} def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -96,18 +106,18 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = "{{"+label+"}|Append}" - dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}"];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -116,7 +126,7 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) if attr_string != '': dot_string += ' [{0}] '.format(attr_string) @@ -125,6 +135,28 @@ def ProcessAppendDescriptor(segment, 
parent_node_name, affix, edge_attributes = return dot_graph +def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = None): + dot_graph = [] + + label = 'Round ({0})'.format(segment['arguments'][1]) + style = None + if edge_attributes is not None: + if edge_attributes.has_key('label'): + label = "{0} {1}".format(edge_attributes['label'], label) + if edge_attributes.has_key('style'): + style = 'style={0}'.format(edge_attributes['style']) + + attr_string = 'label="{0}"'.format(label) + if style is not None: + attr_string += ' {0}'.format(style) + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], + attr_string)) + if segment['sub_segments']: + raise Exception("Round can just deal with forwarding descriptor, no sub-segments allowed") + return dot_graph + + def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = None): dot_graph = [] @@ -140,8 +172,8 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [ {2} ]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("Offset can just deal with forwarding descriptor, no sub-segments allowed") @@ -151,21 +183,23 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non dot_graph = [] names = [] desc_name = 'Sum_{0}'.format(affix) + # create the sum node for i in range(len(segment['sub_segments'])): sub_segment = segment['sub_segments'][i] part_name = "{0}{1}{2}".format(desc_name, sub_segment['name'], i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), i)) - dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], i)) + dot_graph += DescriptorSegmentToDot(sub_segment, "{0}:{1}".format(desc_name, part_name), desc_name+"_"+str(i)) + # link the sum node parts to corresponding segments part_index = len(segment['sub_segments']) for i in range(len(segment['arguments'])): part_name = "{0}{1}{2}".format(desc_name, segment['arguments'][i], part_index + i) - names.append("<{0}> part {1}".format(GetDotNodeName(part_name), part_index + i)) - dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i]), GetDotNodeName(desc_name), GetDotNodeName(part_name))) + names.append("<{0}> part {1}".format(GetDotNodeName(part_name)['node'], part_index + i)) + dot_graph.append("{0} -> {1}:{2}".format(GetDotNodeName(segment['arguments'][i])['node'], GetDotNodeName(desc_name)['node'], GetDotNodeName(part_name)['node'])) label = "|".join(names) label = '{{'+label+'}|Sum}' - dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name), label)) + dot_graph.append('{0} [shape=Mrecord, label="{1}", color=red];'.format(GetDotNodeName(desc_name)['node'], label)) attr_string = '' if edge_attributes is not None: @@ -174,7 +208,7 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non if edge_attributes.has_key('style'): attr_string += ' style={0} '.format(edge_attributes['style']) - dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name), GetDotNodeName(parent_node_name)) + 
dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) dot_string += ' [{0} tailport=s ] '.format(attr_string) dot_graph.append(dot_string) @@ -195,8 +229,8 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu if style is not None: attr_string += ' {0}'.format(style) - dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0]), - GetDotNodeName(parent_node_name), + dot_graph.append('{0}->{1} [{2}]'.format(GetDotNodeName(segment['arguments'][0])['node'], + GetDotNodeName(parent_node_name)['node'], attr_string)) if segment['sub_segments']: raise Exception("ReplaceIndex can just deal with forwarding descriptor, no sub-segments allowed") @@ -215,7 +249,7 @@ def ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes dot_graph += DescriptorSegmentToDot(sub_segment, parent_node_name, parent_node_name, edge_attributes={'style':'dotted', 'label':'IfDefined'}) if segment['arguments']: - dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0]), GetDotNodeName(parent_node_name))) + dot_graph.append('{0} -> {1} [style=dotted, label="IfDefined"]'.format(GetDotNodeName(segment['arguments'][0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_graph @@ -232,6 +266,8 @@ def DescriptorSegmentToDot(segment, parent_node_name, affix, edge_attributes = N dot_graph += ProcessIfDefinedDescriptor(segment, parent_node_name, affix, edge_attributes) elif segment['name'] == "ReplaceIndex": dot_graph += ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attributes) + elif segment['name'] == "Round": + dot_graph += ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes) else: raise Exception('Descriptor {0}, is not recognized by this script. 
Please add Process{0}Descriptor method'.format(segment['name'])) return dot_graph @@ -244,7 +280,7 @@ def Nnet3DescriptorToDot(descriptor, parent_node_name): dot_lines += DescriptorSegmentToDot(segment, parent_node_name, parent_node_name) elif arguments: assert(len(arguments) == 1) - dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0]), GetDotNodeName(parent_node_name))) + dot_lines.append("{0} -> {1}".format(GetDotNodeName(arguments[0])['node'], GetDotNodeName(parent_node_name)['node'])) return dot_lines def ParseNnet3String(string): @@ -298,27 +334,28 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): except KeyError: pass - return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True), label, attr_string)] + return ['{0} [label="{1}" {2}]'.format(GetDotNodeName(component_config['name'], is_component = True)['node'], label, attr_string)] # input-node name=input dim=40 def Nnet3InputToDot(parsed_config): - return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['dim'] )] + return ['{0} [ label="{1}\\ndim={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['dim'] )] # output-node name=output input=Final_log_softmax dim=3940 objective=linear +#output-node name=output input=Offset(Final_log_softmax, 5) dim=3940 objective=linear def Nnet3OutputToDot(parsed_config): dot_graph = [] - dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'], parsed_config['objective'])) - dot_graph.append('{0} -> {1}'.format(GetDotNodeName(parsed_config['input']), GetDotNodeName(parsed_config['name']))) + dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) + dot_graph.append('{0} [ label="{1}\\nobjective={2}"]'.format(GetDotNodeName(parsed_config['name'])['node'], parsed_config['name'], parsed_config['objective'])) return dot_graph # dim-range-node name=Lstm1_r_t input-node=Lstm1_rp_t dim-offset=0 dim=256 def Nnet3DimrangeToDot(parsed_config): dot_graph = [] - dot_graph.append(parsed_config['name']) - dot_graph.append('{0} [shape=rectangle]'.format(GetDotNodeName(parsed_config['name']))) - dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node']), - GetDotNodeName(parsed_config['name']), + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [shape=rectangle, label="{1}"]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [taillabel="dimrange({2}, {3})"]'.format(GetDotNodeName(parsed_config['input-node'])['node'], + GetDotNodeName(parsed_config['name'])['node'], parsed_config['dim-offset'], parsed_config['dim'])) return dot_graph @@ -326,9 +363,10 @@ def Nnet3DimrangeToDot(parsed_config): def Nnet3ComponentNodeToDot(parsed_config): dot_graph = [] dot_graph += Nnet3DescriptorToDot(parsed_config['input'], parsed_config['name']) - dot_graph.append('{0} [ label="{1}", shape=box ]'.format(GetDotNodeName(parsed_config['name']), parsed_config['name'])) - dot_graph.append('{0} -> {1} [ weight=10 ]'.format(GetDotNodeName(parsed_config['component'], is_component = True), - GetDotNodeName(parsed_config['name']))) + dot_node = GetDotNodeName(parsed_config['name']) + dot_graph.append('{0} [ label="{1}", shape=box ]'.format(dot_node['node'], dot_node['label'])) + dot_graph.append('{0} -> {1} [ weight=10 
]'.format(GetDotNodeName(parsed_config['component'], is_component = True)['node'], + GetDotNodeName(parsed_config['name'])['node'])) return dot_graph def GroupConfigs(configs, node_prefixes = []): @@ -408,6 +446,8 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): " will be clustered together in the dot-graph" " --node-prefixes Lstm1,Lstm2,Layer1", default=None) + parser.add_argument("dotfile", help="name of the dot output file") + print(' '.join(sys.argv), file=sys.stderr) args = parser.parse_args() @@ -420,4 +460,7 @@ def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): lines = sys.stdin.readlines() dot_graph = ParseConfigLines(lines, component_attributes = component_attributes, node_prefixes = node_prefixes) - print("\n".join(dot_graph)) + + dotfile_handle = open(args.dotfile, "w") + dotfile_handle.write("\n".join(dot_graph)) + dotfile_handle.close() diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index 28dc237153e..364f6a72443 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -19,14 +19,19 @@ feat_type=raw # set it to 'lda' to use LDA features. frames_per_eg=8 # number of frames of labels per example. more->less disk space and # less time preparing egs, but more I/O during training. # note: the script may reduce this if reduce_frames_per_eg is true. -left_context=4 # amount of left-context per eg -right_context=4 # amount of right-context per eg +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs compress=true # set this to false to disable compression (e.g. if you want to see whether # results are affected). reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg # if there is only one archive and even with the - # reduced frames_pe_eg, the number of + # reduced frames_per_eg, the number of # samples_per_iter that would result is less than or # equal to the user-specified value. num_utts_subset=300 # number of utterances in validation and training @@ -34,15 +39,18 @@ num_utts_subset=300 # number of utterances in validation and training num_valid_frames_combine=0 # #valid frames for combination weights at the very end. num_train_frames_combine=10000 # # train frames for the above. num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs -samples_per_iter=400000 # each iteration of training, see this many samples - # per job. This is just a guideline; it will pick a number - # that divides the number of samples in the entire data. +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. transform_dir= # If supplied, overrides alidir as the place to find fMLLR transforms -post_dir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. 
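A note on the nnet3 graph-plotting changes earlier in this patch: the call sites now consistently index the return value of GetDotNodeName with ['node'] (and, where a label is printed, ['label']), so the helper evidently returns a small dict rather than a bare string. The function body itself is not part of this diff; the sketch below is only an assumed illustration of the contract those call sites rely on, and the exact name-sanitization rule is hypothetical.

# Assumed sketch of what GetDotNodeName returns (illustrative, not the real code).
import re

def GetDotNodeName(nnet3_name, is_component=False):
    # dot identifiers cannot contain characters like '-' or '.', so map them
    # away, but keep the original string for use as a human-readable label
    node = re.sub(r'[^A-Za-z0-9_]', '_', nnet3_name)
    if is_component:
        node += '_component'   # keep component names distinct from node names
    return {'node': node, 'label': nnet3_name}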
+nj=6 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. online_ivector_dir= # can be used if we are including speaker information as iVectors. cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the @@ -53,16 +61,17 @@ echo "$0 $@" # Print the command line for logging if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; - if [ $# != 3 ]; then echo "Usage: $0 [opts] " echo " e.g.: $0 data/train exp/tri3_ali exp/tri4_nnet/egs" echo "" echo "Main options (for others, see top of script file)" echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." - echo " --samples-per-iter <#samples;400000> # Number of samples of data to process per iteration, per" - echo " # process." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" echo " --feat-type # (raw is the default). The feature type you want" echo " # to use as input to the neural net." echo " --frames-per-eg # number of frames per eg on disk" @@ -73,7 +82,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -89,19 +98,19 @@ for f in $data/feats.scp $alidir/ali.1.gz $alidir/final.mdl $alidir/tree $extra_ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done -nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... - sdata=$data/split$nj utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +num_ali_jobs=$(cat $alidir/num_jobs) || exit 1; + +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; -if [ -f $data/utt2uniq ]; then +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" echo "include all perturbed versions of the same 'real' utterances." mv $dir/valid_uttlist $dir/valid_uttlist.tmp @@ -117,7 +126,26 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +# because we'll need the features with a different number of jobs than $alidir, +# copy to ark,scp. +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. 
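The utt2uniq handling just above ensures that when data augmentation has produced several perturbed copies of one original utterance, either all of them or none of them end up in the validation list, so a perturbed twin of a validation utterance cannot leak into training. A rough Python equivalent of that awk/apply_map pipeline (function and variable names here are illustrative):

# Rough stand-in for the valid_uttlist augmentation step (illustrative only).
def augment_valid_list(valid_utts, utt2uniq):
    """valid_utts: iterable of utt ids; utt2uniq: dict utt -> 'real' utt id."""
    uniq2utt = {}
    for utt, uniq in utt2uniq.items():
        uniq2utt.setdefault(uniq, []).append(utt)
    expanded = set()
    for utt in valid_utts:
        # pull in every perturbed version of the same underlying recording
        expanded.update(uniq2utt.get(utt2uniq.get(utt, utt), [utt]))
    return sorted(expanded)

# e.g. utt2uniq = {'sp0.9-u1': 'u1', 'u1': 'u1', 'sp1.1-u1': 'u1'}
# augment_valid_list(['u1'], utt2uniq) -> ['sp0.9-u1', 'sp1.1-u1', 'u1']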
echo "$0: feature type is $feat_type" case $feat_type in @@ -126,7 +154,7 @@ case $feat_type in train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; @@ -140,32 +168,25 @@ case $feat_type in *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; esac -if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then - echo "$0: using transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/trans.JOB ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/trans.*|' ark:- ark:- |" -fi -if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then - echo "$0: using raw-fMLLR transforms from $transform_dir" - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transform_dir/raw_trans.*|' ark:- ark:- |" +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" fi if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; - + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" - train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" else echo 0 >$dir/info/ivector_dim fi -if [ $stage -le 0 ]; then +if [ $stage -le 1 ]; then echo "$0: working out number of frames of training data" num_frames=$(steps/nnet2/get_num_frames.sh $data) echo $num_frames > $dir/info/num_frames @@ -192,10 +213,22 @@ while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ done $reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." 
+# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + echo $num_archives >$dir/info/num_archives echo $frames_per_eg >$dir/info/frames_per_eg - -# Working out number of egs per archive +# Work out the number of egs per archive egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] ! [ $egs_per_archive -le $samples_per_iter ] && \ echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ @@ -206,38 +239,48 @@ echo $egs_per_archive > $dir/info/egs_per_archive echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" -# Making soft links to storage directories. This is a no-up unless -# the subdirectory $dir/storage/ exists. See utils/create_split_dir.pl -for x in `seq $num_archives`; do - utils/create_data_link.pl $dir/egs.$x.ark - for y in `seq $nj`; do - utils/create_data_link.pl $dir/egs_orig.$x.$y.ark + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) done -done +fi + +if [ $stage -le 2 ]; then + echo "$0: copying data alignments" + for id in $(seq $num_ali_jobs); do gunzip -c $alidir/ali.$id.gz; done | \ + copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + echo $left_context > $dir/info/left_context echo $right_context > $dir/info/right_context num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}') -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then echo "$0: Getting validation and training subset examples." rm $dir/.error 2>/dev/null echo "$0: ... extracting validation and training-subset alignments." - set -o pipefail; - for id in $(seq $nj); do gunzip -c $alidir/ali.$id.gz; done | \ - copy-int-vector ark:- ark,t:- | \ - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) | \ - gzip -c >$dir/ali_special.gz || exit 1; - set +o pipefail; # unset the pipefail option. 
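The intermediate-archive block above exists because each egs-dumping job keeps one output file open per archive; if num_archives plus a few ancillary handles exceeds the per-process limit reported by `ulimit -n`, the script first writes a smaller number of intermediate archives and later splits each into archives_multiple final ones. The same sizing logic in Python form (the "+4" spare handles mirror the script; the example numbers are made up):

# Sketch of the intermediate-archive sizing used above.
def plan_archives(num_archives, max_open_filehandles):
    num_archives_intermediate = num_archives
    archives_multiple = 1
    # keep a few handles spare, as the script's "$num_archives_intermediate+4" does
    while num_archives_intermediate + 4 > max_open_filehandles:
        archives_multiple += 1
        num_archives_intermediate = num_archives // archives_multiple + 1
    # round num_archives up to an exact multiple so the later split comes out even
    num_archives = archives_multiple * num_archives_intermediate
    return num_archives, num_archives_intermediate, archives_multiple

# e.g. 2500 requested archives under a 1024-handle limit:
# plan_archives(2500, 1024) -> (2502, 834, 3)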
+ + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/ali.scp >$dir/ali_special.scp $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $egs_opts "$valid_feats" \ - "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || touch $dir/.error & + nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $egs_opts "$train_subset_feats" \ - "ark,s,cs:gunzip -c $dir/ali_special.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" \ + nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ "ark:$dir/train_subset_all.egs" || touch $dir/.error & wait; [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 @@ -262,55 +305,76 @@ if [ $stage -le 2 ]; then for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do [ ! -s $f ] && echo "No examples in file $f" && exit 1; done - rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs $dir/ali_special.gz + rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs fi -if [ $stage -le 3 ]; then - # create egs_orig.*.*.ark; the first index goes to $num_archives, - # the second to $nj (which is the number of jobs in the original alignment - # dir) +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. egs_list= - for n in $(seq $num_archives); do - egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. - if [ ! -z $post_dir ]; then - $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - scp:$post_dir/post.JOB.scp ark:- \| \ - nnet3-copy-egs ark:- $egs_list || exit 1; - else - $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ - nnet3-copy-egs ark:- $egs_list || exit 1; - fi + # The examples will go round-robin to egs_list. 
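The round-robin comment here is kept from the old script; with the new `nnet3-copy-egs --random=true --srand=JOB` invocation the destination archive is chosen pseudo-randomly rather than strictly cyclically, but the effect is the same: each job's egs are spread over $num_archives_intermediate per-job files egs_orig.JOB.n.ark, which are later concatenated across jobs and shuffled. A pure-Python stand-in for the fan-out, plus the index mapping used a little further down when archives_multiple > 1 (this is an illustration, not the actual binary):

# Stand-in for the fan-out done by nnet3-copy-egs (shown in the simple cyclic form).
def fan_out(examples, num_outputs):
    buckets = [[] for _ in range(num_outputs)]
    for i, eg in enumerate(examples):
        buckets[i % num_outputs].append(eg)   # --random=true would pick at random
    return buckets

def final_archive_index(x, y, archives_multiple):
    # mirrors archive_index=$[($x-1)*$archives_multiple+$y], used for the
    # egs.$x.$y.ark -> egs.$archive_index.ark symlinks when archives_multiple > 1
    return (x - 1) * archives_multiple + y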
+ $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ + "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; fi -if [ $stage -le 4 ]; then + +if [ $stage -le 5 ]; then echo "$0: recombining and shuffling order of archives on disk" - # combine all the "egs_orig.JOB.*.scp" (over the $nj splits of the data) and + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and # shuffle the order, writing to the egs.JOB.ark + # the input is a concatenation over the input jobs. egs_list= - for n in $(seq $nj); do - egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" done - $cmd $io_opts $extra_opts JOB=1:$num_archives $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark + ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \ + nnet3-copy-egs ark:- $output_archives || exit 1; + fi + fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: removing temporary archives" - for x in `seq $num_archives`; do - for y in `seq $nj`; do + for x in $(seq $nj); do + for y in $(seq $num_archives_intermediate); do file=$dir/egs_orig.$x.$y.ark [ -L $file ] && rm $(readlink -f $file) rm $file done done + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/egs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null fi echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh new file mode 100755 index 00000000000..c72bbc073ab --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -0,0 +1,457 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# Copyright 2014-2015 Vimal Manohar + +# This script dumps examples for MPE, MMI or state-level minimum Bayes risk (sMBR) +# training of neural nets. +# Criteria supported are mpe, smbr and mmi + +# Begin configuration section. 
+cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +frames_per_eg=150 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +frames_overlap_per_eg=30 # number of supervised frames of overlap that we aim for per eg. + # can be useful to avoid wasted data if you're using --left-deriv-truncate + # and --right-deriv-truncate. +frame_subsampling_factor=1 # ratio between input and output frame-rate of nnet. + # this should be read from the nnet. For now, it is taken as an option +left_context=4 # amount of left-context per eg (i.e. extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +adjust_priors=true +priors_left_context= # amount of left_context for priors egs +priors_right_context= # amount of right_context for priors egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). +num_utts_subset=80 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. + +frames_per_iter=400000 # each iteration of training, see this many frames + # per job. This is just a guideline; it will pick a number + # that divides the number of samples in the entire data. + +determinize=true +minimize=true +remove_output_symbols=true +remove_epsilons=true +collapse_transition_ids=true +acwt=0.1 + +stage=0 +max_jobs_run=15 +max_shuffle_jobs_run=15 + +transform_dir= # If this is a SAT system, directory for transforms +online_ivector_dir= +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +num_priors_subset=100 +num_archives_priors=10 + +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 6 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet_denlats exp/tri4/final.mdl exp/tri4_mpe/degs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs (probably would be good to add -tc 5 or so if using" + echo " # GridEngine (to avoid excessive NFS traffic)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + echo " --online-ivector-dir # Directory for online-estimated iVectors, used in the" + echo " # online-neural-net setup." + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +denlatdir=$4 +src_model=$5 +dir=$6 + +extra_files= +[ ! -z $online_ivector_dir ] && \ + extra_files="$online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp" + +# Check some files. 
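The frames_per_eg=150 / frames_overlap_per_eg=30 defaults in the configuration section above mean each utterance is cut into chunks of roughly 150 supervised frames whose supervised regions overlap by about 30 frames, so frames near chunk boundaries (whose derivatives may be truncated by --left-deriv-truncate / --right-deriv-truncate) are still covered by a neighbouring chunk. The actual splitting happens inside nnet3-discriminative-get-egs; the snippet below is only back-of-envelope arithmetic under those assumptions.

# Back-of-envelope chunk count for one utterance (illustrative only).
import math

def approx_num_chunks(utt_frames, frames_per_eg=150, overlap=30):
    stride = frames_per_eg - overlap          # new frames contributed per chunk
    if utt_frames <= frames_per_eg:
        return 1
    return 1 + math.ceil((utt_frames - frames_per_eg) / stride)

# e.g. a 1000-frame utterance -> 1 + ceil(850/120) = 9 chunks of ~150 frames,
# each sharing roughly 30 supervised frames with its neighbour.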
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/num_jobs $alidir/tree \ + $denlatdir/lat.1.gz $denlatdir/num_jobs $src_model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log $dir/info || exit 1; + +[ "$(readlink /bin/sh)" == dash ] && \ + echo "This script won't work if /bin/sh points to dash. make it point to bash." && exit 1 + +nj=$(cat $denlatdir/num_jobs) || exit 1; + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + +if [ $stage -le 1 ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; +fi + +prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" + +if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +cp $alidir/tree $dir +cp $lang/phones/silence.csl $dir/info/ +cp $src_model $dir/final.mdl || exit 1 + +# Get list of utterances for prior computation. +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_priors_subset \ + > $dir/priors_uttlist || exit 1; + +## We don't support deltas here, only LDA or raw (mainly because deltas are less +## frequently used). +if [ -z $feat_type ]; then + if [ -f $alidir/final.mat ] && [ ! 
-f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts > $dir/cmvn_opts + ;; + lda) + splice_opts=`cat $alidir/splice_opts 2>/dev/null` + cp $alidir/splice_opts $dir 2>/dev/null + cp $alidir/final.mat $dir + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." && exit 1; + cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` + cp $alidir/cmvn_opts $dir 2>/dev/null + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" + priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" +fi + +if [ ! 
-z $online_ivector_dir ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim >$dir/info/ivector_dim + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + priors_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +fi + +if [ $stage -le 2 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s/JOB/1/g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# Working out total number of archives. Add one on the assumption the +# num-frames won't divide exactly, and we want to round up. +num_archives=$[$num_frames/$frames_per_iter+1] + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). +max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple] || exit 1; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] || exit 1; + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] || exit 1; +! [ $egs_per_archive -le $frames_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= frames_per_iter=$frames_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
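The $dir/storage convention referenced here (see utils/create_split_dir.pl) lets the many large .ark archives live on several filesystems: when $dir/storage exists, each archive is pre-created in one of the storage subdirectories and a symlink with the canonical name is left in $dir. The snippet below is only a schematic of that idea, not the actual utils/create_data_link.pl utility, and the round-robin placement rule is an assumption.

# Schematic of spreading archive files over storage directories via symlinks.
import os

def link_archives(dir_, names):
    storage = sorted(os.listdir(os.path.join(dir_, "storage")))   # e.g. ['1', '2', ...]
    for i, name in enumerate(names):
        target = os.path.join("storage", storage[i % len(storage)], name)
        open(os.path.join(dir_, target), "a").close()   # create the real file
        link = os.path.join(dir_, name)
        if not os.path.lexists(link):
            os.symlink(target, link)                    # canonical name points at it

# e.g. link_archives(dir, ["degs.%d.ark" % x for x in range(1, num_archives + 1)])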
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/degs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/degs_orig.$y.$x.ark; done) + done +fi + +if [ $stage -le 3 ]; then + echo "$0: copying training lattices" + + $cmd --max-jobs-run 6 JOB=1:$nj $dir/log/lattice_copy.JOB.log \ + lattice-copy --write-compact=false --include="cat $dir/valid_uttlist $dir/train_subset_uttlist |" --ignore-missing \ + "ark:gunzip -c $denlatdir/lat.JOB.gz|" ark,scp:$dir/lat_special.JOB.ark,$dir/lat_special.JOB.scp || exit 1; + + for id in $(seq $nj); do cat $dir/lat_special.$id.scp; done > $dir/lat_special.scp +fi + +splitter_opts="--supervision-splitter.determinize=$determinize --supervision-splitter.minimize=$minimize --supervision-splitter.remove_output_symbols=$remove_output_symbols --supervision-splitter.remove_epsilons=$remove_epsilons --supervision-splitter.collapse-transition-ids=$collapse_transition_ids --supervision-splitter.acoustic-scale=$acwt" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; + +[ -z $priors_left_context ] && priors_left_context=$left_context; +[ -z $priors_right_context ] && priors_right_context=$right_context; + +left_context=$[left_context+frame_subsampling_factor/2] +right_context=$[right_context+frame_subsampling_factor/2] + +egs_opts="--left-context=$left_context --right-context=$right_context --num-frames=$frames_per_eg --num-frames-overlap=$frames_overlap_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +valid_left_context=$[valid_left_context+frame_subsampling_factor/2] +valid_right_context=$[valid_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the validation data. +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --num-frames=$frames_per_eg --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress $splitter_opts" + +priors_left_context=$[priors_left_context+frame_subsampling_factor/2] +priors_right_context=$[priors_right_context+frame_subsampling_factor/2] + +# don't do the overlap thing for the priors computation data. +priors_egs_opts="--left-context=$priors_left_context --right-context=$priors_right_context --num-frames=1 --compress=$compress" + +supervision_all_opts="--frame-subsampling-factor=$frame_subsampling_factor" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +echo $priors_left_context > $dir/info/priors_left_context +echo $priors_right_context > $dir/info/priors_right_context + +echo $frame_subsampling_factor > $dir/info/frame_subsampling_factor + +( + if $adjust_priors && [ $stage -le 10 ]; then + if [ ! -f $dir/ali.scp ]; then + nj_ali=$(cat $alidir/num_jobs) + all_ids=$(seq -s, $nj_ali) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $alidir/ali.{$all_ids}.gz|" \ + ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; + fi + + priors_egs_list= + for y in `seq $num_archives_priors`; do + utils/create_data_link.pl $dir/priors_egs.$y.ark + priors_egs_list="$priors_egs_list ark:$dir/priors_egs.$y.ark" + done + + echo "$0: dumping egs for prior adjustment in the background." 
+ + num_pdfs=`am-info $alidir/final.mdl | grep pdfs | awk '{print $NF}' 2>/dev/null` || exit 1 + + $cmd $dir/log/create_priors_subset.log \ + nnet3-get-egs --num-pdfs=$num_pdfs $priors_ivector_opt $priors_egs_opts "$priors_feats" \ + "$prior_ali_rspecifier ali-to-post ark:- ark:- |" \ + ark:- \| nnet3-copy-egs ark:- $priors_egs_list || \ + { touch $dir/.error; echo "Error in creating priors subset. See $dir/log/create_priors_subset.log"; exit 1; } + + sleep 3; + + echo $num_archives_priors >$dir/info/num_archives_priors + else + echo 0 > $dir/info/num_archives_priors + fi +) & + +if [ $stage -le 4 ]; then + echo "$0: Getting validation and training subset examples." + rm $dir/.error 2>/dev/null + echo "$0: ... extracting validation and training-subset alignments." + + #utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + # <$dir/lat.scp >$dir/lat_special.scp + + utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ + <$dir/ali.scp >$dir/ali_special.scp + + $cmd $dir/log/create_valid_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $valid_ivector_opt $valid_egs_opts \ + $dir/final.mdl "$valid_feats" ark,s,cs:- "ark:$dir/valid_diagnostic.degs" || touch $dir/.error & + + $cmd $dir/log/create_train_subset.log \ + discriminative-get-supervision $supervision_all_opts \ + scp:$dir/ali_special.scp scp:$dir/lat_special.scp ark:- \| \ + nnet3-discriminative-get-egs $train_subset_ivector_opt $egs_opts \ + $dir/final.mdl "$train_subset_feats" ark,s,cs:- "ark:$dir/train_diagnostic.degs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + + for f in $dir/{train_diagnostic,valid_diagnostic}.degs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done +fi + +if [ $stage -le 5 ]; then + # create degs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + degs_list= + for n in $(seq $num_archives_intermediate); do + degs_list="$degs_list ark:$dir/degs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + + # The examples will go round-robin to degs_list. + # To make it efficient we need to use a large 'nj', like 40, and in that case + # there can be too many small files to deal with, because the total number of + # files is the product of 'nj' by 'num_archives_intermediate', which might be + # quite large. + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + discriminative-get-supervision $supervision_all_opts \ + "scp:utils/filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp |" \ + "ark:gunzip -c $denlatdir/lat.JOB.gz |" ark:- \| \ + nnet3-discriminative-get-egs $ivector_opt $egs_opts \ + $dir/final.mdl "$feats" ark,s,cs:- ark:- \| \ + nnet3-discriminative-copy-egs --random=true --srand=JOB ark:- $degs_list || exit 1; +fi + +if [ $stage -le 6 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "degs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the degs.JOB.ark + + # the input is a concatenation over the input jobs. + degs_list= + for n in $(seq $nj); do + degs_list="$degs_list $dir/degs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. 
+ $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:$dir/degs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). + output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/degs.JOB.$y.ark; done)" + for x in $(seq $num_archives_intermediate); do + for y in $(seq $archives_multiple); do + archive_index=$[($x-1)*$archives_multiple+$y] + # degs.intermediate_archive.{1,2,...}.ark will point to degs.archive.ark + ln -sf degs.$archive_index.ark $dir/degs.$x.$y.ark || exit 1 + done + done + $cmd --max-jobs-run $max_shuffle_jobs_run --mem 8G JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-discriminative-shuffle-egs --srand=JOB "ark:cat $degs_list|" ark:- \| \ + nnet3-discriminative-copy-egs ark:- $output_archives || exit 1; + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: removing temporary archives" + ( + cd $dir + for f in $(ls -l . | grep 'degs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done + ) + if [ $archives_multiple -gt 1 ]; then + # there are some extra soft links that we should delete. + for f in $dir/degs.*.*.ark; do rm $f; done + fi + echo "$0: removing temporary lattices" + rm $dir/lat.* + echo "$0: removing temporary alignments and transforms" + # Ignore errors below because trans.* might not exist. + rm $dir/{ali,trans}.{ark,scp} 2>/dev/null +fi + +wait + +echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh new file mode 100755 index 00000000000..7fbc24858b5 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -0,0 +1,409 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2015-2016 Vimal Manohar +# Apache 2.0. + +# This script is similar to steps/nnet3/get_egs.sh but used +# when getting general targets (not from alignment directory) for raw nnet +# +# This script, which will generally be called from other neural-net training +# scripts, extracts the training examples used to train the neural net (and also +# the validation examples used for diagnostics), and puts them in separate archives. +# +# This script dumps egs with several frames of labels, controlled by the +# frames_per_eg config variable (default: 8). This takes many times less disk +# space because typically we have 4 to 7 frames of context on the left and +# right, and this ends up getting shared. This is at the expense of slightly +# higher disk I/O while training. + + +# Begin configuration section. +cmd=run.pl +feat_type=raw # set it to 'lda' to use LDA features. +target_type=sparse # dense to have dense targets, + # sparse to have posteriors targets +num_targets= # required for target-type=sparse with raw nnet +frames_per_eg=8 # number of frames of labels per example. more->less disk space and + # less time preparing egs, but more I/O during training. + # note: the script may reduce this if reduce_frames_per_eg is true. +left_context=4 # amount of left-context per eg (i.e. 
extra frames of input features + # not present in the output supervision). +right_context=4 # amount of right-context per eg. +valid_left_context= # amount of left_context for validation egs, typically used in + # recurrent architectures to ensure matched condition with + # training egs +valid_right_context= # amount of right_context for validation egs +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + +reduce_frames_per_eg=true # If true, this script may reduce the frames_per_eg + # if there is only one archive and even with the + # reduced frames_per_eg, the number of + # samples_per_iter that would result is less than or + # equal to the user-specified value. +num_utts_subset=300 # number of utterances in validation and training + # subsets used for shrinkage and diagnostics. +num_valid_frames_combine=0 # #valid frames for combination weights at the very end. +num_train_frames_combine=10000 # # train frames for the above. +num_frames_diagnostic=4000 # number of frames for "compute_prob" jobs +samples_per_iter=400000 # this is the target number of egs in each archive of egs + # (prior to merging egs). We probably should have called + # it egs_per_iter. This is just a guideline; it will pick + # a number that divides the number of samples in the + # entire data. + +transform_dir= + +stage=0 +nj=6 # This should be set to the maximum number of jobs you are + # comfortable to run in parallel; you can increase it if your disk + # speed is greater and you have more machines. +online_ivector_dir= # can be used if we are including speaker information as iVectors. +cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, + # it doesn't make sense to use different options than were used as input to the + # LDA transform). This is used to turn off CMVN in the online-nnet experiments. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/train/snr_targets.scp exp/tri4_nnet/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --nj # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --samples-per-iter <#samples;400000> # Target number of egs per archive (option is badly named)" + echo " --feat-type # (raw is the default). The feature type you want" + echo " # to use as input to the neural net." + echo " --frames-per-eg # number of frames per eg on disk" + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --num-frames-diagnostic <#frames;4000> # Number of frames used in computing (train,valid) diagnostics" + echo " --num-valid-frames-combine <#frames;10000> # Number of frames used in getting combination weights at the" + echo " # very end." + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +[ ! 
-z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $data/feats.scp $targets_scp $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log $dir/info + + +# Get list of validation utterances. +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset | sort \ + > $dir/valid_uttlist || exit 1; + +if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. + echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $dir/valid_uttlist $dir/valid_uttlist.tmp + utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt + cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist + rm $dir/uniq2utt $dir/valid_uttlist.tmp +fi + +awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl | head -$num_utts_subset | sort > $dir/train_subset_uttlist || exit 1; + +if [ ! -z "$transform_dir" ] && [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then + echo "$0: using transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi +if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then + echo "$0: using raw transforms from $transform_dir" + if [ $stage -le 0 ]; then + $cmd $dir/log/copy_transforms.log \ + copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" + fi +fi + + + +## Set up features. +echo "$0: feature type is $feat_type" + +case $feat_type in + raw) feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. + ;; + lda) + splice_opts=`cat $transform_dir/splice_opts 2>/dev/null` + # caution: the top-level nnet training script should copy these to its own dir now. + cp $transform_dir/{splice_opts,cmvn_opts,final.mat} $dir || exit 1; + [ ! -z "$cmvn_opts" ] && \ + echo "You cannot supply --cmvn-opts option if feature type is LDA." 
&& exit 1; + cmvn_opts=$(cat $dir/cmvn_opts) + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; +esac + +if [ -f $dir/trans.scp ]; then + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" + valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" + train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" +fi + +if [ ! -z "$online_ivector_dir" ]; then + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; + echo $ivector_dim > $dir/info/ivector_dim + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + + ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + valid_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" + train_subset_ivector_opt="--ivectors='ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $online_ivector_dir/ivector_online.scp | subsample-feats --n=-$ivector_period scp:- ark:- |'" +else + echo 0 >$dir/info/ivector_dim +fi + +if [ $stage -le 1 ]; then + echo "$0: working out number of frames of training data" + num_frames=$(steps/nnet2/get_num_frames.sh $data) + echo $num_frames > $dir/info/num_frames + echo "$0: working out feature dim" + feats_one="$(echo $feats | sed s:JOB:1:g)" + feat_dim=$(feat-to-dim "$feats_one" -) || exit 1; + echo $feat_dim > $dir/info/feat_dim +else + num_frames=$(cat $dir/info/num_frames) || exit 1; + feat_dim=$(cat $dir/info/feat_dim) || exit 1; +fi + +# the + 1 is to round up, not down... we assume it doesn't divide exactly. +num_archives=$[$num_frames/($frames_per_eg*$samples_per_iter)+1] +# (for small data)- while reduce_frames_per_eg == true and the number of +# archives is 1 and would still be 1 if we reduced frames_per_eg by 1, reduce it +# by 1. +reduced=false +while $reduce_frames_per_eg && [ $frames_per_eg -gt 1 ] && \ + [ $[$num_frames/(($frames_per_eg-1)*$samples_per_iter)] -eq 0 ]; do + frames_per_eg=$[$frames_per_eg-1] + num_archives=1 + reduced=true +done +$reduced && echo "$0: reduced frames_per_eg to $frames_per_eg because amount of data is small." + +# We may have to first create a smaller number of larger archives, with number +# $num_archives_intermediate, if $num_archives is more than the maximum number +# of open filehandles that the system allows per process (ulimit -n). 
+max_open_filehandles=$(ulimit -n) || exit 1 +num_archives_intermediate=$num_archives +archives_multiple=1 +while [ $[$num_archives_intermediate+4] -gt $max_open_filehandles ]; do + archives_multiple=$[$archives_multiple+1] + num_archives_intermediate=$[$num_archives/$archives_multiple+1]; +done +# now make sure num_archives is an exact multiple of archives_multiple. +num_archives=$[$archives_multiple*$num_archives_intermediate] + +echo $num_archives >$dir/info/num_archives +echo $frames_per_eg >$dir/info/frames_per_eg +# Work out the number of egs per archive +egs_per_archive=$[$num_frames/($frames_per_eg*$num_archives)] +! [ $egs_per_archive -le $samples_per_iter ] && \ + echo "$0: script error: egs_per_archive=$egs_per_archive not <= samples_per_iter=$samples_per_iter" \ + && exit 1; + +echo $egs_per_archive > $dir/info/egs_per_archive + +echo "$0: creating $num_archives archives, each with $egs_per_archive egs, with" +echo "$0: $frames_per_eg labels per example, and (left,right) context = ($left_context,$right_context)" + + + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $num_archives); do echo $dir/egs.$x.ark; done) + for x in $(seq $num_archives_intermediate); do + utils/create_data_link.pl $(for y in $(seq $nj); do echo $dir/egs_orig.$y.$x.ark; done) + done +fi + +egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" + +[ -z $valid_left_context ] && valid_left_context=$left_context; +[ -z $valid_right_context ] && valid_right_context=$right_context; +valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" + +echo $left_context > $dir/info/left_context +echo $right_context > $dir/info/right_context + +for n in `seq $nj`; do + utils/filter_scp.pl $sdata/$n/utt2spk $targets_scp > $dir/targets.$n.scp +done + +targets_scp_split=$dir/targets.JOB.scp + +if [ $target_type == "dense" ]; then + num_targets=$(feat-to-dim "scp:$targets_scp" - 2>/dev/null) || exit 1 +fi + +if [ -z "$num_targets" ]; then + echo "$0: num-targets is not set" + exit 1 +fi + +case $target_type in + "dense") + get_egs_program="nnet3-get-egs-dense-targets --num-targets=$num_targets" + + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | copy-feats scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | copy-feats scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | copy-feats scp:- ark:- |" + ;; + "sparse") + get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" + targets="ark:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" + valid_targets="ark:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + train_subset_targets="ark:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" + ;; + default) + echo "$0: Unknown --target-type $target_type. Choices are dense and sparse" + exit 1 +esac + +if [ $stage -le 3 ]; then + echo "$0: Getting validation and training subset examples." 
+ rm -f $dir/.error 2>/dev/null + $cmd $dir/log/create_valid_subset.log \ + $get_egs_program \ + $valid_ivector_opt $valid_egs_opts "$valid_feats" \ + "$valid_targets" \ + "ark:$dir/valid_all.egs" || touch $dir/.error & + $cmd $dir/log/create_train_subset.log \ + $get_egs_program \ + $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ + "$train_subset_targets" \ + "ark:$dir/train_subset_all.egs" || touch $dir/.error & + wait; + [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 + echo "... Getting subsets of validation examples for diagnostics and combination." + $cmd $dir/log/create_valid_subset_combine.log \ + nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ + ark:$dir/valid_combine.egs || touch $dir/.error & + $cmd $dir/log/create_valid_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ + ark:$dir/valid_diagnostic.egs || touch $dir/.error & + + $cmd $dir/log/create_train_subset_combine.log \ + nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ + ark:$dir/train_combine.egs || touch $dir/.error & + $cmd $dir/log/create_train_subset_diagnostic.log \ + nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ + ark:$dir/train_diagnostic.egs || touch $dir/.error & + wait + sleep 5 # wait for file system to sync. + cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs + + for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do + [ ! -s $f ] && echo "No examples in file $f" && exit 1; + done + rm -f $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs +fi + +if [ $stage -le 4 ]; then + # create egs_orig.*.*.ark; the first index goes to $nj, + # the second to $num_archives_intermediate. + + egs_list= + for n in $(seq $num_archives_intermediate); do + egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" + done + echo "$0: Generating training examples on disk" + # The examples will go round-robin to egs_list. + $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ + $get_egs_program \ + $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" "$targets" \ + ark:- \| \ + nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; +fi + +if [ $stage -le 5 ]; then + echo "$0: recombining and shuffling order of archives on disk" + # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and + # shuffle the order, writing to the egs.JOB.ark + + # the input is a concatenation over the input jobs. + egs_list= + for n in $(seq $nj); do + egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" + done + + if [ $archives_multiple == 1 ]; then # normal case. + $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; + else + # we need to shuffle the 'intermediate archives' and then split into the + # final archives. we create soft links to manage this splitting, because + # otherwise managing the output names is quite difficult (and we don't want + # to submit separate queue jobs for each intermediate archive, because then + # the --max-jobs-run option is hard to enforce). 
+ output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)"
+ for x in $(seq $num_archives_intermediate); do
+ for y in $(seq $archives_multiple); do
+ archive_index=$[($x-1)*$archives_multiple+$y]
+ # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark
+ ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1
+ done
+ done
+ $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \
+ nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \
+ nnet3-copy-egs ark:- $output_archives || exit 1;
+ fi
+
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: removing temporary archives"
+ for x in $(seq $nj); do
+ for y in $(seq $num_archives_intermediate); do
+ file=$dir/egs_orig.$x.$y.ark
+ [ -L $file ] && rm $(readlink -f $file)
+ rm $file
+ done
+ done
+ if [ $archives_multiple -gt 1 ]; then
+ # there are some extra soft links that we should delete.
+ for f in $dir/egs.*.*.ark; do rm $f; done
+ fi
+ echo "$0: removing temporary transforms and target scp files"
+ # Ignore errors below because trans.* might not exist.
+ rm -f $dir/trans.{ark,scp} $dir/targets.*.scp 2>/dev/null
+fi
+
+echo "$0: Finished preparing training examples"
+
diff --git a/egs/wsj/s5/steps/nnet3/get_successful_models.py b/egs/wsj/s5/steps/nnet3/get_successful_models.py
new file mode 100755
index 00000000000..3661d91b8d5
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet3/get_successful_models.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+import re
+import os
+import argparse
+import sys
+import warnings
+import copy
+import glob
+
+
+if __name__ == "__main__":
+ # we add compulsory arguments as named arguments for readability
+ parser = argparse.ArgumentParser(description="Create a list of models suitable for averaging "
+ "based on their train objf values.",
+ epilog="See steps/nnet3/lstm/train.sh for example.")
+
+ parser.add_argument("--difference-threshold", type=float,
+ help="Threshold for discarding models: a model is discarded "
+ "when its objective function is more than this much worse "
+ "than that of the best model.",
+ default=1.0)
+
+ parser.add_argument("num_models", type=int,
+ help="Number of models.")
+
+ parser.add_argument("logfile_pattern", type=str,
+ help="Pattern for identifying the log-file names. "
+ "It specifies the entire log file name, except for the job number, "
+ "which is replaced with '%'. e.g. exp/nnet3/tdnn_sp/log/train.4.%.log")
+
+
+ args = parser.parse_args()
+
+ assert(args.num_models > 0)
+
+ parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames")
+ loss = []
+ for i in range(args.num_models):
+ model_num = i + 1
+ logfile = re.sub('%', str(model_num), args.logfile_pattern)
+ lines = open(logfile, 'r').readlines()
+ this_loss = -100000
+ for line_num in range(1, len(lines) + 1):
+ # we search from the end of the log, as this needs
+ # fewer regex searches (Python regex matching is slow).
+ mat_obj = parse_regex.search(lines[-1*line_num]) + if mat_obj is not None: + this_loss = float(mat_obj.groups()[0]) + break; + loss.append(this_loss); + max_index = loss.index(max(loss)) + accepted_models = [] + for i in range(args.num_models): + if (loss[max_index] - loss[i]) <= args.difference_threshold: + accepted_models.append(i+1) + + model_list = " ".join(map(lambda x: str(x), accepted_models)) + print(model_list) + + if len(accepted_models) != args.num_models: + print("WARNING: Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), args.num_models, args.logfile_pattern), file=sys.stderr) + print(" Using models {0}".format(model_list), file=sys.stderr) diff --git a/egs/wsj/s5/steps/nnet3/lstm/decode.sh b/egs/wsj/s5/steps/nnet3/lstm/decode.sh new file mode 100755 index 00000000000..07195c071d3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/lstm/decode.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does decoding with a neural-net. If the neural net was built on +# top of fMLLR transforms from a conventional system, you should provide the +# --transform-dir option. + +# Begin configuration section. +stage=1 +transform_dir= # dir to find fMLLR transforms. +nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +cmd=run.pl +beam=15.0 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +parallel_opts= # ignored now. +scoring_opts= +skip_scoring=false +feat_type= +online_ivector_dir= +minimize=false + +frames_per_chunk=10000 +extra_left_context=20 # it is recommended to use the same value as the chunk_left_context + # used during training +extra_right_context=0 # it is recommended to use the same value as the chunk_right_context + # used during training (usually used in bi-directional LSTM case) +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/nnet3/decode.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" + echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" + echo "main options (for others, see top of script file)" + echo " --transform-dir # directory of previous decoding" + echo " # where we can find transforms for SAT systems." + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --parallel-opts # e.g. '--num-threads 4' if you supply --num-threads 4" + exit 1; +fi + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory. +model=$srcdir/$iter.mdl + + +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do + [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh --per-utt $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +## Set up features. +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi + echo "$0: feature type is $feat_type" +fi + +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` + +case $feat_type in + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + ;; + *) echo "$0: invalid feature type $feat_type" && exit 1; +esac +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && \ + ! cmp $transform_dir/../final.mat $srcdir/final.mat && \ + ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then + echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," + echo " but you are not providing the --transform-dir option in test time." +fi +## + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector_period=$ivector_period" +fi + +if [ $stage -le 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts \ + --frames-per-chunk=$frames_per_chunk \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --word-symbol-table=$graphdir/words.txt "$model" \ + $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1; +fi + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. + + +if [ $stage -le 2 ]; then + if ! $skip_scoring ; then + [ ! 
-x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py index 3c7c2e2c975..9c2c641b0e9 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/make_configs.py +++ b/egs/wsj/s5/steps/nnet3/lstm/make_configs.py @@ -6,218 +6,160 @@ import sys import warnings import copy +import imp -# adds the input nodes and returns the descriptor -def AddInputLayer(config_lines, feat_dim, splice_indexes=[0], ivector_dim=0): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - output_dim = 0 - components.append('input-node name=input dim=' + str(feat_dim)) - list = [('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_indexes] - output_dim += len(splice_indexes) * feat_dim - if args.ivector_dim > 0: - components.append('input-node name=ivector dim=' + str(ivector_dim)) - list.append('ReplaceIndex(ivector, t, 0)') - output_dim += ivector_dim - splice_descriptor = "Append({0})".format(", ".join(list)) - print(splice_descriptor) - return {'descriptor': splice_descriptor, - 'dimension': output_dim} - -def AddLdaLayer(config_lines, name, input, lda_file): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append('component name={0}_lda type=FixedAffineComponent matrix={1}'.format(name, lda_file)) - component_nodes.append('component-node name={0}_lda component={0}_lda input={1}'.format(name, input['descriptor'])) - - return {'descriptor': '{0}_lda'.format(name), - 'dimension': input['dimension']} - -def AddAffineLayer(config_lines, name, input, output_dim): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input['dimension'], output_dim)) - component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - - return {'descriptor': '{0}_affine'.format(name), - 'dimension': output_dim} - -def AddAffRelNormLayer(config_lines, name, input, output_dim): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_affine type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input['dimension'], output_dim)) - components.append("component name={0}_relu type=RectifiedLinearComponent dim={1}".format(name, output_dim)) - components.append("component name={0}_renorm type=NormalizeComponent dim={1}".format(name, output_dim)) - - component_nodes.append("component-node name={0}_affine component={0}_affine input={1}".format(name, input['descriptor'])) - component_nodes.append("component-node name={0}_relu component={0}_relu input={0}_affine".format(name)) - component_nodes.append("component-node name={0}_renorm component={0}_renorm input={0}_relu".format(name)) - - return {'descriptor': '{0}_renorm'.format(name), - 'dimension': output_dim} - - - -def AddSoftmaxLayer(config_lines, name, input): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - components.append("component name={0}_log_softmax type=LogSoftmaxComponent 
dim={1}".format(name, input['dimension'])) - component_nodes.append("component-node name={0}_log_softmax component={0}_log_softmax input={1}".format(name, input['descriptor'])) - - return {'descriptor': '{0}_log_softmax'.format(name), - 'dimension': input['dimension']} - -def AddOutputNode(config_lines, input): - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - component_nodes.append('output-node name=output input={0}'.format(input['descriptor'])) - -def AddFinalLayer(config_lines, input, output_dim): - prev_layer_output = AddAffineLayer(config_lines, "Final", input, output_dim) - prev_layer_output = AddSoftmaxLayer(config_lines, "Final", prev_layer_output) - AddOutputNode(config_lines, prev_layer_output) - -def AddLstmLayer(config_lines, - name, input, cell_dim, - recurrent_projection_dim = 0, - non_recurrent_projection_dim = 0): - assert(recurrent_projection_dim >= 0 and non_recurrent_projection_dim >= 0) - components = config_lines['components'] - component_nodes = config_lines['component-nodes'] - - input_descriptor = input['descriptor'] - input_dim = input['dimension'] - name = name.strip() - - if (recurrent_projection_dim == 0): - add_recurrent_projection = False - recurrent_projection_dim = cell_dim - recurrent_connection = "m_t" - else: - add_recurrent_projection = True - recurrent_connection = "r_t" - if (non_recurrent_projection_dim == 0): - add_non_recurrent_projection = False - else: - add_non_recurrent_projection = True - - - # Parameter Definitions W*(* replaced by - to have valid names) - components.append("# Input gate control : W_i* matrices") - components.append("component name={0}_W_i-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_ic type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Forget gate control : W_f* matrices") - components.append("component name={0}_W_f-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_fc type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Output gate control : W_o* matrices") - components.append("component name={0}_W_o-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - components.append("# note : the cell outputs pass through a diagonal matrix") - components.append("component name={0}_w_oc type=NaturalGradientPerElementScaleComponent dim={1}".format(name, cell_dim)) - - components.append("# Cell input matrices : W_c* matrices") - components.append("component name={0}_W_c-xr type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, input_dim + recurrent_projection_dim, cell_dim)) - - if add_recurrent_projection and add_non_recurrent_projection: - components.append("# projection matrices : Wrm and Wpm") - components.append("component name={0}_W-m type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, cell_dim, recurrent_projection_dim + non_recurrent_projection_dim)) - - elif add_recurrent_projection : - components.append("# projection matrices : Wrm") - 
components.append("component name={0}_Wrm type=NaturalGradientAffineComponent input-dim={1} output-dim={2}".format(name, cell_dim, recurrent_projection_dim)) - - components.append("# Defining the non-linearities") - components.append("component name={0}_i type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_f type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_o type=SigmoidComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_g type=TanhComponent dim={1}".format(name, cell_dim)) - components.append("component name={0}_h type=TanhComponent dim={1}".format(name, cell_dim)) - - components.append("# Defining the cell computations") - components.append("component name={0}_c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - components.append("component name={0}_m type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - - # c1_t and c2_t defined below - c_tminus1_descriptor = "Sum(IfDefined(Offset({0}_c1_t, -1)), IfDefined(Offset( {0}_c2_t, -1)))".format(name) - - component_nodes.append("# i_t") - component_nodes.append("component-node name={0}_i1 component={0}_W_i-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_i2 component={0}_w_ic input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_i_t component={0}_i input=Sum({0}_i1, {0}_i2)".format(name)) - - component_nodes.append("# f_t") - component_nodes.append("component-node name={0}_f1 component={0}_W_f-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_f2 component={0}_w_fc input={1}".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_f_t component={0}_f input=Sum({0}_f1,{0}_f2)".format(name)) - - component_nodes.append("# o_t") - component_nodes.append("component-node name={0}_o1 component={0}_W_o-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_o2 component={0}_w_oc input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - component_nodes.append("component-node name={0}_o_t component={0}_o input=Sum({0}_o1, {0}_o2)".format(name)) - - component_nodes.append("# h_t") - component_nodes.append("component-node name={0}_h_t component={0}_h input=Sum({0}_c1_t, {0}_c2_t)".format(name)) - - component_nodes.append("# g_t") - component_nodes.append("component-node name={0}_g1 component={0}_W_c-xr input=Append({1}, IfDefined(Offset({0}_{2}, -1)))".format(name, input_descriptor, recurrent_connection)) - component_nodes.append("component-node name={0}_g_t component={0}_g input={0}_g1".format(name)) - - component_nodes.append("# parts of c_t") - component_nodes.append("component-node name={0}_c1_t component={0}_c1 input=Append({0}_f_t, {1})".format(name, c_tminus1_descriptor)) - component_nodes.append("component-node name={0}_c2_t component={0}_c2 input=Append({0}_i_t, {0}_g_t)".format(name)) - - component_nodes.append("# m_t") - component_nodes.append("component-node name={0}_m_t component={0}_m input=Append({0}_o_t, 
{0}_h_t)".format(name)) - - if (add_recurrent_projection and add_non_recurrent_projection): - component_nodes.append("# r_t and p_t") - component_nodes.append("component-node name={0}_rp_t component={0}_W-m input={0}_m_t".format(name)) - component_nodes.append("dim-range-node name={0}_r_t input-node={0}_rp_t dim-offset=0 dim={1}".format(name, recurrent_projection_dim)) - output_descriptor = '{0}_rp_t'.format(name) - output_dim = recurrent_projection_dim + non_recurrent_projection_dim - - elif add_recurrent_projection: - component_nodes.append("# r_t") - component_nodes.append("component-node name={0}_r_t component={0}_Wrm input={0}_m_t".format(name)) - output_descriptor = '{0}_r_t'.format(name) - output_dim = recurrent_projection_dim +nodes = imp.load_source('nodes', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for LSTMs creation and training", + epilog="See steps/nnet3/lstm/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # General neural network options + parser.add_argument("--splice-indexes", type=str, + help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3'", required = True, default="0") + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + + # LSTM options + parser.add_argument("--num-lstm-layers", type=int, + help="Number of LSTM layers to be stacked", default=1) + parser.add_argument("--cell-dim", type=int, + help="dimension of lstm-cell") + parser.add_argument("--recurrent-projection-dim", type=int, + help="dimension of recurrent projection") + parser.add_argument("--non-recurrent-projection-dim", type=int, + help="dimension of non-recurrent projection") + parser.add_argument("--hidden-dim", type=int, + help="dimension of fully-connected layers") + + # Natural gradient options + parser.add_argument("--ng-per-element-scale-options", type=str, + help="options to be supplied to NaturalGradientPerElementScaleComponent", default="") + parser.add_argument("--ng-affine-options", type=str, + help="options to be supplied to NaturalGradientAffineComponent", default="") + + # Gradient clipper options + parser.add_argument("--norm-based-clipping", type=str, action=nnet3_train_lib.StrToBoolAction, + help="use norm based clipping in ClipGradient components ", default=True, choices = ["false", "true"]) + parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components, if clipping-threshold=0 no clipping is done", default=30) + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + # Delay options + parser.add_argument("--label-delay", type=int, default=None, + help="option to delay the labels to make the lstm robust") + + parser.add_argument("--lstm-delay", type=str, default=None, + help="option to have different delays in recurrence for each lstm") + + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.num_lstm_layers < 1): + sys.exit("--num-lstm-layers has to be a positive integer") + if (args.clipping_threshold < 0): + sys.exit("--clipping-threshold has to be a non-negative") + if args.lstm_delay is None: + args.lstm_delay = [[-1]] * args.num_lstm_layers else: - output_descriptor = '{0}_m_t'.format(name) - output_dim = cell_dim + try: + args.lstm_delay = ParseLstmDelayString(args.lstm_delay.strip()) + except ValueError: + sys.exit("--lstm-delay has incorrect format value. Provided value is '{0}'".format(args.lstm_delay)) + if len(args.lstm_delay) != args.num_lstm_layers: + sys.exit("--lstm-delay: Number of delays provided has to match --num-lstm-layers") - return { - 'descriptor': output_descriptor, - 'dimension':output_dim - } + return args def PrintConfig(file_name, config_lines): f = open(file_name, 'w') f.write("\n".join(config_lines['components'])+"\n") f.write("\n#Component nodes\n") - f.write("\n".join(config_lines['component-nodes'])) + f.write("\n".join(config_lines['component-nodes'])+"\n") f.close() -def ParseSpliceString(splice_indexes): +def ParseSpliceString(splice_indexes, label_delay=None): ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] split1 = splice_indexes.split(" "); # we already checked the string is nonempty. if len(split1) < 1: splice_indexes = "0" - left_context = 0 - right_context = 0 + left_context=0 + right_context=0 + if label_delay is not None: + left_context = -label_delay + right_context = label_delay + splice_array = [] try: for i in range(len(split1)): - indexes = map(lambda x: int(x), split1[i].split(",")) + indexes = map(lambda x: int(x), split1[i].strip().split(",")) + print(indexes) if len(indexes) < 1: raise ValueError("invalid --splice-indexes argument, too-short element: " + splice_indexes) @@ -232,7 +174,7 @@ def ParseSpliceString(splice_indexes): right_context += indexes[-1] splice_array.append(indexes) except ValueError as e: - raise ValueError("invalid --splice-indexes argument " + splice_indexes + e) + raise ValueError("invalid --splice-indexes argument " + splice_indexes + str(e)) left_context = max(0, left_context) right_context = max(0, right_context) @@ -243,111 +185,146 @@ def ParseSpliceString(splice_indexes): 'num_hidden_layers':len(splice_array) } -if __name__ == "__main__": - # we add compulsary arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Writes config files and variables " - "for LSTMs creation and training", - epilog="See steps/nnet3/lstm/train.sh for example.") - parser.add_argument("--splice-indexes", type=str, - help="Splice indexes at input layer, e.g. 
'-3,-2,-1,0,1,2,3' [compulsary argument]", default="0") - parser.add_argument("--num-lstm-layers", type=int, - help="Number of LSTM layers to be stacked", default=1) - parser.add_argument("--feat-dim", type=int, - help="Raw feature dimension, e.g. 13") - parser.add_argument("--ivector-dim", type=int, - help="iVector dimension, e.g. 100", default=0) - parser.add_argument("--cell-dim", type=int, - help="dimension of lstm-cell") - parser.add_argument("--recurrent-projection-dim", type=int, - help="dimension of recurrent projection") - parser.add_argument("--non-recurrent-projection-dim", type=int, - help="dimension of non-recurrent projection") - parser.add_argument("--hidden-dim", type=int, - help="dimension of fully-connected layers") - parser.add_argument("--bptt-truncation-width", type=int, - help="number of time steps through which gradient is backpropagated", default=20) - parser.add_argument("--context-sensitive-chunk-width", type=int, - help="number of frames used to estimate the state of the first frame in truncated BPTT ", default=20) - parser.add_argument("--num-targets", type=int, - help="number of network targets (e.g. num-pdf-ids/num-leaves)") - parser.add_argument("config_dir", - help="Directory to write config files and variables") - - print(' '.join(sys.argv)) - - args = parser.parse_args() - - if not os.path.exists(args.config_dir): - os.makedirs(args.config_dir) - - ## Check arguments. - if args.splice_indexes is None: - sys.exit("--splice-indexes argument is required") - if args.feat_dim is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required") - if args.num_targets is None or not (args.num_targets > 0): - sys.exit("--feat-dim argument is required") - if (args.num_lstm_layers < 1): - sys.exit("--num-lstm-layers has to be a positive integer") - if (args.bptt_truncation_width < 1): - sys.exit("--bptt-truncation-width has to be a positive integer") - if (args.context_sensitive_chunk_width < 0): - sys.exit("--context-sensitive-chunk-width has to be a non-negative integer") - - - - parsed_splice_output = ParseSpliceString(args.splice_indexes) - left_context = parsed_splice_output['left_context'] - right_context = parsed_splice_output['right_context'] - num_hidden_layers = parsed_splice_output['num_hidden_layers'] - splice_indexes = parsed_splice_output['splice_indexes'] +def ParseLstmDelayString(lstm_delay): + ## Work out lstm_delay e.g. "-1 [-1,1] -2" -> list([ [-1], [-1, 1], [-2] ]) + split1 = lstm_delay.split(" "); + lstm_delay_array = [] + try: + for i in range(len(split1)): + indexes = map(lambda x: int(x), split1[i].strip().lstrip('[').rstrip(']').strip().split(",")) + if len(indexes) < 1: + raise ValueError("invalid --lstm-delay argument, too-short element: " + + lstm_delay) + elif len(indexes) == 2 and indexes[0] * indexes[1] >= 0: + raise ValueError('Warning: ' + str(indexes) + ' is not a standard BLSTM mode. 
There should be a negative delay for the forward, and a postive delay for the backward.') + lstm_delay_array.append(indexes) + except ValueError as e: + raise ValueError("invalid --lstm-delay argument " + lstm_delay + str(e)) - if (num_hidden_layers < args.num_lstm_layers): - sys.exit("--num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + return lstm_delay_array - left_context = left_context + args.bptt_truncation_width + args.context_sensitive_chunk_width - right_context = right_context - # write the files used by other scripts like steps/nnet3/get_egs.sh - f = open(args.config_dir + "/vars", "w") - print('left_context=' + str(left_context), file=f) - print('right_context=' + str(right_context), file=f) - print('num_hidden_layers=' + str(num_hidden_layers), file=f) - # print('initial_right_context=' + str(splice_array[0][-1]), file=f) - f.close() +def MakeConfigs(config_dir, feat_dim, ivector_dim, num_targets, + splice_indexes, lstm_delay, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + num_lstm_layers, num_hidden_layers, + norm_based_clipping, clipping_threshold, + ng_per_element_scale_options, ng_affine_options, + label_delay, include_log_softmax, xent_regularize, self_repair_scale): config_lines = {'components':[], 'component-nodes':[]} config_files={} - prev_layer_output = AddInputLayer(config_lines, args.feat_dim, splice_indexes[0], args.ivector_dim) + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) # Add the init config lines for estimating the preconditioning matrices init_config_lines = copy.deepcopy(config_lines) init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') init_config_lines['components'].insert(0, '# preconditioning matrix computation') - AddOutputNode(init_config_lines, prev_layer_output) - config_files[args.config_dir + '/init.config'] = init_config_lines - - prev_layer_output = AddLdaLayer(config_lines, "L0", prev_layer_output, args.config_dir + '/lda.mat') - - for i in range(args.num_lstm_layers): - prev_layer_output = AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, args.cell_dim, - args.recurrent_projection_dim, args.non_recurrent_projection_dim) + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + for i in range(num_lstm_layers): + if len(lstm_delay[i]) == 2: # BLSTM layer case, add both forward and backward + prev_layer_output1 = nodes.AddLstmLayer(config_lines, "BLstm{0}_forward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) + prev_layer_output2 = nodes.AddLstmLayer(config_lines, "BLstm{0}_backward".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][1], self_repair_scale = self_repair_scale) + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) + prev_layer_output['dimension'] = prev_layer_output1['dimension'] + 
prev_layer_output2['dimension'] + else: # LSTM layer case + prev_layer_output = nodes.AddLstmLayer(config_lines, "Lstm{0}".format(i+1), prev_layer_output, cell_dim, + recurrent_projection_dim, non_recurrent_projection_dim, + clipping_threshold, norm_based_clipping, + ng_per_element_scale_options, ng_affine_options, + lstm_delay = lstm_delay[i][0], self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - AddFinalLayer(config_lines, prev_layer_output, args.num_targets) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines - config_lines = {'components':[], 'component-nodes':[]} + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) - for i in range(args.num_lstm_layers, num_hidden_layers): - prev_layer_output = AddAffRelNormLayer(config_lines, "L{0}".format(i+1), prev_layer_output, args.hidden_dim) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + if len(lstm_delay[i]) == 2: + # since the form 'Append(Append(xx, yy), zz)' is not allowed, here we don't wrap the descriptor with 'Append()' so that we would have the form + # 'Append(xx, yy, zz)' in the next lstm layer + prev_layer_output['descriptor'] = '{0}, {1}'.format(prev_layer_output1['descriptor'], prev_layer_output2['descriptor']) + + if len(lstm_delay[i]) == 2: + # since there is no 'Append' in 'AffRelNormLayer', here we wrap the descriptor with 'Append()' + prev_layer_output['descriptor'] = 'Append({0})'.format(prev_layer_output['descriptor']) + for i in range(num_lstm_layers, num_hidden_layers): + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "L{0}".format(i+1), + prev_layer_output, hidden_dim, + ng_affine_options, self_repair_scale = self_repair_scale) # make the intermediate config file for layerwise discriminative # training - AddFinalLayer(config_lines, prev_layer_output, args.num_targets) - config_files['{0}/layer{1}.config'.format(args.config_dir, i+1)] = config_lines + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, ng_affine_options, label_delay = label_delay, include_log_softmax = include_log_softmax) + + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines config_lines = {'components':[], 'component-nodes':[]} # printing out the configs # init.config used to train lda-mllt train for key in config_files.keys(): PrintConfig(key, config_files[key]) + + + + +def ProcessSpliceIndexes(config_dir, splice_indexes, label_delay, num_lstm_layers): + parsed_splice_output = ParseSpliceString(splice_indexes.strip(), label_delay) + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + + if (num_hidden_layers < num_lstm_layers): + raise Exception("num-lstm-layers : number of lstm layers has to be greater than number of layers, decided based on splice-indexes") + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = 
open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + # print('initial_right_context=' + str(splice_array[0][-1]), file=f) + f.close() + + return [left_context, right_context, num_hidden_layers, splice_indexes] + + +def Main(): + args = GetArgs() + [left_context, right_context, num_hidden_layers, splice_indexes] = ProcessSpliceIndexes(args.config_dir, args.splice_indexes, args.label_delay, args.num_lstm_layers) + + MakeConfigs(args.config_dir, + args.feat_dim, args.ivector_dim, args.num_targets, + splice_indexes, args.lstm_delay, args.cell_dim, + args.recurrent_projection_dim, args.non_recurrent_projection_dim, + args.num_lstm_layers, num_hidden_layers, + args.norm_based_clipping, + args.clipping_threshold, + args.ng_per_element_scale_options, args.ng_affine_options, + args.label_delay, args.include_log_softmax, args.xent_regularize, + args.self_repair_scale) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/lstm/train.sh b/egs/wsj/s5/steps/nnet3/lstm/train.sh index fade7ef454d..3a1c7f14535 100755 --- a/egs/wsj/s5/steps/nnet3/lstm/train.sh +++ b/egs/wsj/s5/steps/nnet3/lstm/train.sh @@ -1,33 +1,29 @@ #!/bin/bash - -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar # 2014-2015 Vijayaditya Peddinti # Apache 2.0. +# Terminology: +# sample - one input-output tuple, which is an input sequence and output sequence for LSTM +# frame - one output label and the input context used to compute it # Begin configuration section. cmd=run.pl -num_epochs=15 # Number of epochs of training; +num_epochs=10 # Number of epochs of training; # the number of iterations is worked out from this. -initial_effective_lrate=0.01 -final_effective_lrate=0.001 -rand_prune=4.0 # Relates to a speedup we do for LDA. -minibatch_size=512 # This default is suitable for GPU-based training. - # Set it to 128 for multi-threaded CPU-based training. - -samples_per_iter=400000 # each iteration of training, see this many samples - # per job. This option is passed to get_egs.sh -num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=20000 # 20k samples per job, for computing priors. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= -presoftmax_prior_scale_power=-0.25 +presoftmax_prior_scale_power=-0.25 # we haven't yet used pre-softmax prior scaling in the LSTM model remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give @@ -48,45 +44,75 @@ stage=-6 exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage # count space-separated fields in splice_indexes to get num-hidden-layers. 
-splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 0 0 0 0" +splice_indexes="-2,-1,0,1,2 0 0" # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count + # LSTM parameters num_lstm_layers=3 cell_dim=1024 # dimension of the LSTM cell hidden_dim=1024 # the dimension of the fully connected hidden layer outputs -recurrent_projection_dim=256 +recurrent_projection_dim=256 non_recurrent_projection_dim=256 -bptt_truncation_width=20 # number of BPTT steps -context_sensitive_chunk_width=20 # number of steps used in the estimation of the first LSTM state - # see Chen 2015, "Training Deep Bidirectional LSTM Acoustic Model for LVCSR by a Context-Sensitive-Chunk BPTT Approach" - - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't -randprune=4.0 # speeds up LDA. -affine_opts= - +norm_based_clipping=true # if true norm_based_clipping is used. + # In norm-based clipping the activation Jacobian matrix + # for the recurrent connections in the network is clipped + # to ensure that the individual row-norm (l2) does not increase + # beyond the clipping_threshold. + # If false, element-wise clipping is used. +clipping_threshold=30 # if norm_based_clipping is true this would be the maximum value of the row l2-norm, + # else this is the max-absolute value of each element in Jacobian. +chunk_width=20 # number of output labels in the sequence used to train an LSTM + # Caution: if you double this you should halve --samples-per-iter. +chunk_left_context=40 # number of steps used in the estimation of LSTM state before prediction of the first label +chunk_right_context=0 # number of steps used in the estimation of LSTM state before prediction of the first label (usually used in bi-directional LSTM case) +label_delay=5 # the lstm output is used to predict the label with the specified delay +lstm_delay=" -1 -2 -3 " # the delay to be used in the recurrence of lstms + # "-1 -2 -3" means the a three layer stacked LSTM would use recurrence connections with + # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively + # "[-1,1] [-2,2] [-3,3]" means a three layer stacked bi-directional LSTM would use recurrence + # connections with delay -1 for the forward, 1 for the backward at layer1, + # -2 for the forward, 2 for the backward at layer2, and so on at layer3 +num_bptt_steps= # this variable counts the number of time steps to back-propagate from the last label in the chunk + # it is usually same as chunk_width + + +# nnet3-train options +shrink=0.99 # this parameter would be used to scale the parameter matrices +shrink_threshold=0.15 # a value less than 0.25 that we compare the mean of + # 'deriv-avg' for sigmoid components with, and if it's + # less, we shrink. +max_param_change=2.0 # max param change per minibatch +num_chunk_per_minibatch=100 # number of sequences to be processed in parallel every mini-batch + +samples_per_iter=20000 # this is really the number of egs in each archive. Each eg has + # 'chunk_width' frames in it-- for chunk_width=20, this value (20k) + # is equivalent to the 400k number that we use as a default in + # regular DNN training. +momentum=0.5 # e.g. 0.5. Note: we implemented it in such a way that + # it doesn't increase the effective learning rate. use_gpu=true # if true, we run on GPU. -num_threads=16 # if using CPU, the number of threads we use. cleanup=true egs_dir= max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. 
lda_opts= egs_opts= transform_dir= # If supplied, this dir used instead of alidir to find transforms. -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. num_jobs_align=30 # Number of jobs for realignment + +rand_prune=4.0 # speeds up LDA. + # End configuration section. -frames_per_eg=8 # to be passed on to get_egs.sh trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM @@ -102,14 +128,13 @@ if [ $# != 4 ]; then echo "Main options (for others, see top of script file)" echo " --config # config file containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --num-epochs <#epochs|15> # Number of epochs of training" - echo " --initial-effective-lrate # effective learning rate at start of training." - echo " --final-effective-lrate # effective learning rate at end of training." + echo " --num-epochs <#epochs|10> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." echo " # data, 0.00025 for large data" - echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" - echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" - echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" - echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" + echo " --momentum # Momentum constant: note, this is " + echo " # implemented in such a way that it doesn't" + echo " # increase the effective learning rate." echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" @@ -118,24 +143,13 @@ if [ $# != 4 ]; then echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." - echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" - echo " # should not get too large, e.g. >2k)." - echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" - echo " # process." - echo " --splice-indexes " + echo " --splice-indexes " echo " # Frame indices used for each splice layer." echo " # Format : .... 
" + echo " # the number of fields determines the number of LSTM and non-recurrent layers" + echo " # also see the --num-lstm-layers option" echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " ################### LSTM options ###################### " - echo " --num-lstm-layers # number of LSTM layers" - echo " --lstm-cell-dim # dimension of the LSTM cell" - echo " --hidden-dim # the dimension of the fully connected hidden layer outputs" - echo " --recurrent-projection-dim # the output dimension of the recurrent-projection-matrix" - echo " --non-recurrent-projection-dim # the output dimension of the non-recurrent-projection-matrix" - echo " --bptt-truncation-width # number of BPTT steps" - echo " --context-sensitive-chunk-width # number of steps used in the estimation of the first LSTM state" echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" echo " # realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" @@ -144,7 +158,39 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + echo " ################### LSTM options ###################### " + echo " --num-lstm-layers # number of LSTM layers" + echo " --cell-dim # dimension of the LSTM cell" + echo " --hidden-dim # the dimension of the fully connected hidden layer outputs" + echo " --recurrent-projection-dim # the output dimension of the recurrent-projection-matrix" + echo " --non-recurrent-projection-dim # the output dimension of the non-recurrent-projection-matrix" + echo " --chunk-left-context # number of time-steps used in the estimation of the first LSTM state" + echo " --chunk-width # number of output labels in the sequence used to train an LSTM" + echo " # Caution: if you double this you should halve --samples-per-iter." + echo " --norm-based-clipping # if true norm_based_clipping is used." + echo " # In norm-based clipping the activation Jacobian matrix" + echo " # for the recurrent connections in the network is clipped" + echo " # to ensure that the individual row-norm (l2) does not increase" + echo " # beyond the clipping_threshold." + echo " # If false, element-wise clipping is used." + echo " --num-bptt-steps # this variable counts the number of time steps to back-propagate from the last label in the chunk" + echo " # it defaults to chunk_width" + echo " --label-delay # the lstm output is used to predict the label with the specified delay" + + echo " --lstm-delay # the delay to be used in the recurrence of lstms" + echo " # \"-1 -2 -3\" means the a three layer stacked LSTM would use recurrence connections with " + echo " # delays -1, -2 and -3 at layer1 lstm, layer2 lstm and layer3 lstm respectively" + echo " --clipping-threshold # if norm_based_clipping is true this would be the maximum value of the row l2-norm," + echo " # else this is the max-absolute value of each element in Jacobian." + + echo " ################### LSTM specific training options ###################### " + echo " --num-chunks-per-minibatch # Number of sequences to be processed in parallel in a minibatch" + echo " --samples-per-iter <#samples|20000> # Number of egs in each archive of data. 
This times --chunk-width is" + echo " # the number of frames processed per iteration" + echo " --shrink # if non-zero this parameter will be used to scale the parameter matrices" + echo " --shrink-threshold # a threshold (should be between 0.0 and 0.25) that controls when to" + echo " # do parameter shrinking." + echo " for more options see the script" exit 1; fi @@ -202,8 +248,14 @@ if [ $stage -le -5 ]; then echo "$0: creating neural net configs"; # create the config files for nnet initialization - python steps/nnet3/lstm/make_configs.py \ - --splice-indexes "$splice_indexes" \ + # note an additional space is added to splice_indexes to + # avoid issues with the python ArgParser which can have + # issues with negative arguments (due to minus sign) + config_extra_opts=() + [ ! -z "$lstm_delay" ] && config_extra_opts+=(--lstm-delay "$lstm_delay") + + steps/nnet3/lstm/make_configs.py "${config_extra_opts[@]}" \ + --splice-indexes "$splice_indexes " \ --num-lstm-layers $num_lstm_layers \ --feat-dim $feat_dim \ --ivector-dim $ivector_dim \ @@ -211,9 +263,10 @@ if [ $stage -le -5 ]; then --hidden-dim $hidden_dim \ --recurrent-projection-dim $recurrent_projection_dim \ --non-recurrent-projection-dim $non_recurrent_projection_dim \ - --bptt-truncation-width $bptt_truncation_width \ - --context-sensitive-chunk-width $context_sensitive_chunk_width \ + --norm-based-clipping $norm_based_clipping \ + --clipping-threshold $clipping_threshold \ --num-targets $num_leaves \ + --label-delay $label_delay \ $dir/configs || exit 1; # Initialize as "raw" nnet, prior to training the LDA-like preconditioning # matrix. This first config just does any initial splicing that we do; @@ -222,13 +275,13 @@ if [ $stage -le -5 ]; then $cmd $dir/log/nnet_init.log \ nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; fi - # sourcing the "vars" below sets -# left_context=(something) -# right_context=(something) +# model_left_context=(something) +# model_right_context=(something) # num_hidden_layers=(something) . $dir/configs/vars || exit 1; - +left_context=$((chunk_left_context + model_left_context)) +right_context=$((chunk_right_context + model_right_context)) context_opts="--left-context=$left_context --right-context=$right_context" ! [ "$num_hidden_layers" -gt 0 ] && echo \ @@ -236,7 +289,6 @@ context_opts="--left-context=$left_context --right-context=$right_context" [ -z "$transform_dir" ] && transform_dir=$alidir - if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts=() [ ! 
-z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") @@ -245,26 +297,30 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then extra_opts+=(--transform-dir $transform_dir) extra_opts+=(--left-context $left_context) extra_opts+=(--right-context $right_context) + extra_opts+=(--valid-left-context $((chunk_width + left_context))) + extra_opts+=(--valid-right-context $((chunk_width + right_context))) + + # Note: in RNNs we process sequences of labels rather than single label per sample echo "$0: calling get_egs.sh" steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ - --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ - --frames-per-eg $frames_per_eg \ + --stage $get_egs_stage \ + --samples-per-iter $samples_per_iter \ + --frames-per-eg $chunk_width \ $data $alidir $dir/egs || exit 1; fi -if [ "$feat_dim" != "$(cat $dir/egs/info/feat_dim)" ]; then - echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $dir/egs/info/feat_dim)"; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; exit 1; fi -if [ "$ivector_dim" != "$(cat $dir/egs/info/ivector_dim)" ]; then - echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $dir/egs/info/ivector_dim)"; +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; exit 1; fi -[ -z $egs_dir ] && egs_dir=$dir/egs - # copy any of the following that exist, to $dir. cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null @@ -272,22 +328,18 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( ! [ $(cat $egs_dir/info/left_context) -le $left_context ] || - ! [ $(cat $egs_dir/info/right_context) -le $right_context ] ) && \ + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; -frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } -num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } - -# num_archives_expanded considers each separate label-position from -# 0..frames_per_eg-1 to be a separate archive. -num_archives_expanded=$[$num_archives*$frames_per_eg] +chunk_width=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/num_archives"; exit 1; } [ $num_jobs_initial -gt $num_jobs_final ] && \ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; -[ $num_jobs_final -gt $num_archives_expanded ] && \ - echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; +[ $num_jobs_final -gt $num_archives ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives." && exit 1; if [ $stage -le -3 ]; then @@ -320,14 +372,14 @@ if [ $stage -le -2 ]; then echo "$0: preparing initial vector for FixedScaleComponent before softmax" echo " ... 
using priors^$presoftmax_prior_scale_power and rescaling to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; $cmd $dir/log/sum_pdf_counts.log \ vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; rm $dir/pdf_counts.* - + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } num_pdfs=NF-2; average_count = total/num_pdfs; @@ -351,10 +403,10 @@ fi # set num_iters so that as close as possible, we process the data $num_epochs -# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. -num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_to_process=$[$num_epochs*$num_archives] num_archives_processed=0 num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] @@ -379,22 +431,16 @@ if $use_gpu; then exit 1 fi else - if [ $num_threads -gt 1 ]; then - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" - train_queue_opt="--num-threads $num_threads" - combine_queue_opt="" # the combine stage will be quite slow if not using - # GPU, as we didn't enable that program to use - # multiple threads. - else - parallel_suffix="" - fi + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. prior_gpu_opt="--use-gpu=no" prior_queue_opt="" fi - -approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +approx_iters_per_epoch_final=$[$num_archives/$num_jobs_final] # First work out how many iterations we want to combine over in the final # nnet3-combine-fast invocation. (We may end up subsampling from these if the # number exceeds max_model_combine). The number we use is: @@ -423,16 +469,16 @@ for realign_time in $realign_times; do done cur_egs_dir=$egs_dir - +[ -z $num_bptt_steps ] && num_bptt_steps=$chunk_width; +min_deriv_time=$((chunk_width - num_bptt_steps)) while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; - this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - - echo "On iteration $x, learning rate is $this_learning_rate." + this_effective_learning_rate=$(perl -e "print ($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt));"); + this_learning_rate=$(perl -e "print ($this_effective_learning_rate*$this_num_jobs);"); if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -440,6 +486,15 @@ while [ $x -lt $num_iters ]; do fi if [ $x -ge 0 ] && [ $stage -le $x ]; then + # Set this_shrink value. 
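To make the schedule arithmetic above concrete: num_iters is chosen so that num_iters times the average number of jobs roughly equals num_epochs * num_archives, and the per-iteration rate interpolates geometrically between the initial and final effective rates. A minimal Python sketch of the same computation (illustrative only, mirroring the perl one-liners above; not part of the patch):

import math

def lr_for_iter(x, num_iters, num_jobs_initial, num_jobs_final,
                num_archives_processed, num_archives_to_process,
                initial_effective_lrate, final_effective_lrate):
    # The number of parallel jobs grows linearly from the initial to the final value.
    this_num_jobs = int(0.5 + num_jobs_initial
                        + (num_jobs_final - num_jobs_initial) * x / float(num_iters))
    # The effective learning rate decays geometrically with the fraction of
    # archives processed so far, reaching final_effective_lrate on the last iteration.
    if x + 1 >= num_iters:
        eff = final_effective_lrate
    else:
        eff = initial_effective_lrate * math.exp(
            num_archives_processed
            * math.log(final_effective_lrate / initial_effective_lrate)
            / num_archives_to_process)
    # Each job is given the effective rate scaled by the number of jobs,
    # since the jobs' models are averaged afterwards.
    return this_num_jobs, eff * this_num_jobs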
+ if [ $x -eq 0 ] || nnet3-am-info --print-args=false $dir/$x.mdl | \ + perl -e "while(<>){ if (m/type=Sigmoid.+deriv-avg=.+mean=(\S+)/) { \$n++; \$tot+=\$1; } } exit(\$tot/\$n > $shrink_threshold);"; then + this_shrink=$shrink; # e.g. avg-deriv of sigmoids was <= 0.125, so shrink. + else + this_shrink=1.0 # don't shrink: sigmoids are not over-saturated. + fi + echo "On iteration $x, learning rate is $this_learning_rate and shrink value is $this_shrink." + if [ ! -z "${realign_this_iter[$x]}" ]; then time=${realign_this_iter[$x]} @@ -478,23 +533,22 @@ while [ $x -lt $num_iters ]; do steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & - # nnet3-show-progress not implemented yet - #if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then - # $cmd $dir/log/progress.$x.log \ - # nnet3-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - # ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - # nnet3-info $dir/$x.mdl & - #fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" '&&' \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:-|" & + fi echo "Training neural net (pass $x)" @@ -506,20 +560,23 @@ while [ $x -lt $num_iters ]; do cur_num_hidden_layers=$[1+$x/$add_layers_period] config=$dir/configs/layer$cur_num_hidden_layers.config raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" + cache_read_opt="" # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" + cache_read_opt="--read-cache=$dir/cache.$x" fi if $do_average; then - this_minibatch_size=$minibatch_size + this_num_chunk_per_minibatch=$num_chunk_per_minibatch else # on iteration zero or when we just added a layer, use a smaller minibatch # size (and we will later choose the output of just one of the jobs): the # model-averaging isn't always helpful when the model is changing too fast # (i.e. it can worsen the objective function), and the smaller minibatch # size will help to keep the update stable. - this_minibatch_size=$[$minibatch_size/2]; + this_num_chunk_per_minibatch=$[$num_chunk_per_minibatch/2]; fi rm $dir/.error 2>/dev/null @@ -528,22 +585,29 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. 
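The shrink heuristic above is dense; a rough Python equivalent may be easier to follow (a sketch only, assuming, as the regex implies, that nnet3-am-info prints one line per Sigmoid component containing a deriv-avg block with a mean= field):

import re
import subprocess

def pick_shrink_value(model, iteration, shrink, shrink_threshold):
    # On iteration 0, or when the sigmoids look saturated (small average
    # derivative), return the shrink factor; otherwise do not shrink.
    if iteration == 0:
        return shrink
    info = subprocess.check_output(
        ['nnet3-am-info', '--print-args=false', model]).decode()
    means = []
    for line in info.splitlines():
        m = re.search(r'type=Sigmoid.+deriv-avg=.+mean=(\S+)', line)
        if m:
            means.append(float(m.group(1).rstrip(',]')))  # tolerate trailing punctuation
    if not means:
        return 1.0
    return shrink if sum(means) / len(means) <= shrink_threshold else 1.0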
- - # We can't easily use a single parallel SGE job to do the main training, + + # We cannot easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + for n in $(seq $this_num_jobs); do - k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we will derive # the other indexes from. archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. - frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame - # index; this increases more slowly than the archive index because the - # same archive with different frame indexes will give similar gradients, - # so we want to separate them in time. - + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi $cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train$parallel_suffix $parallel_train_opts "$raw" \ - "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + nnet3-train $parallel_train_opts $cache_read_opt $cache_write_opt --print-interval=10 --momentum=$momentum \ + --max-param-change=$max_param_change \ + --optimization.min-deriv-time=$min_deriv_time "$raw" \ + "ark,bg:nnet3-copy-egs $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_num_chunk_per_minibatch --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait @@ -552,8 +616,9 @@ while [ $x -lt $num_iters ]; do # have printed a more specific one. [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1; + models_to_average=$(steps/nnet3/get_successful_models.py $this_num_jobs $dir/log/train.$x.%.log) nnets_list= - for n in `seq 1 $this_num_jobs`; do + for n in $models_to_average; do nnets_list="$nnets_list $dir/$[$x+1].$n.raw" done @@ -561,19 +626,24 @@ while [ $x -lt $num_iters ]; do # average the output of the different jobs. $cmd $dir/log/average.$x.log \ nnet3-average $nnets_list - \| \ - nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + nnet3-am-copy --scale=$this_shrink --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; else # choose the best from the different jobs. 
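As an aside on the archive-selection arithmetic a few lines above (the old per-frame indexing is gone because each RNN example is already a whole chunk), a tiny sketch with made-up numbers:

def archive_for_job(num_archives_processed, n, num_archives):
    k = num_archives_processed + n - 1   # zero-based index over all jobs run so far
    return (k % num_archives) + 1        # 1-based archive index

# e.g. with 120 archives and 350 archives already processed, job n=3 reads egs.113.ark
assert archive_for_job(350, 3, 120) == 113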
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; - $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ - nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + nnet3-am-copy --scale=$this_shrink --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; fi + nnets_list= + for n in `seq 1 $this_num_jobs`; do + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" + done + rm $nnets_list [ ! -f $dir/$[$x+1].mdl ] && exit 1; if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ @@ -581,6 +651,7 @@ while [ $x -lt $num_iters ]; do rm $dir/$[$x-1].mdl fi fi + rm $dir/cache.$x 2>/dev/null x=$[$x+1] num_archives_processed=$[$num_archives_processed+$this_num_jobs] done @@ -601,14 +672,11 @@ if [ $stage -le $num_iters ]; then nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; done - # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, - # as if there are many models it can give out-of-memory error; and we set - # num-threads to 8 to speed it up (this isn't ideal...) - + combine_num_chunk_per_minibatch=$(python -c "print int(1024.0/($chunk_width))") $cmd $combine_queue_opt $dir/log/combine.log \ nnet3-combine --num-iters=40 \ --enforce-sum-to-one=true --enforce-positive-weights=true \ - --verbose=3 "${nnets_list[@]}" "ark:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size=$combine_num_chunk_per_minibatch ark:$cur_egs_dir/combine.egs ark:-|" \ "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; # Compute the probability of the final, combined model with @@ -616,20 +684,21 @@ if [ $stage -le $num_iters ]; then # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ - "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + "ark,bg:nnet3-merge-egs --minibatch-size=256 ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. 
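A note on the combination stage above: the minibatch size is now expressed in chunks rather than frames, presumably so that each minibatch still covers roughly the 1024 output frames that the previous fixed --minibatch-size=1024 provided. For example:

chunk_width = 50                                             # illustrative value
combine_num_chunk_per_minibatch = int(1024.0 / chunk_width)  # == 20 chunks, ~1000 output frames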
rm $dir/post.$x.*.vec 2>/dev/null + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ - nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ - nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet3-merge-egs ark:- ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; @@ -667,7 +736,7 @@ if $cleanup; then for x in `seq 0 $num_iters`; do if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then # delete all but every 100th model; don't delete the ones which combine to form the final model. - rm $dir/$x.mdl + rm $dir/$x.mdl fi done fi diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh new file mode 100755 index 00000000000..fadc164c539 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -0,0 +1,251 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +# Create denominator lattices for MMI/MPE training. +# This version uses the neural-net models (version 3, i.e. the nnet3 code). +# Creates its output in $dir/lat.*.gz + +# Begin configuration section. +nj=4 +cmd=run.pl +sub_split=1 +beam=13.0 +frames_per_chunk=50 +lattice_beam=7.0 +self_loop_scale=0.1 +acwt=0.1 +max_active=5000 +min_active=200 +transform_dir= +max_mem=20000000 # This will stop the processes getting too large. +# This is in bytes, but not "real" bytes-- you have to multiply +# by something like 5 or 10 to get real bytes (not sure why so large) +num_threads=1 # Fixed to 1 for now +online_ivector_dir= +determinize=true +minimize=false +ivector_scale=1.0 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +feat_type= # you can set this in order to run on top of delta features, although we don't + # normally want to do this. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +num_threads=1 # Fixed to 1 for now + +if [ $# != 4 ]; then + echo "Usage: steps/nnet3/make_denlats.sh [options] " + echo " e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats" + echo "Works for (delta|lda) features, and (with --transform-dir option) such features" + echo " plus transforms." + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --sub-split # e.g. 40; use this for " + echo " # large databases so your jobs will be smaller and" + echo " # will (individually) finish reasonably soon." + echo " --transform-dir # directory to find fMLLR transforms." + echo " --num-threads # number of threads per decoding job" + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + + +extra_files= +[ ! 
-z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +for f in $data/feats.scp $lang/L.fst $srcdir/final.mdl $extra_files; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1; +done + +sdata=$data/split$nj +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +oov=`cat $lang/oov.int` || exit 1; + +cp -rH $lang $dir/ + +# Compute grammar FST which corresponds to unigram decoding graph. +new_lang="$dir/"$(basename "$lang") + +# mkgraph.sh expects a whole directory "lang", so put everything in one directory... +# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and +# final.mdl from $srcdir; the output HCLG.fst goes in $dir/graph. + +echo "Compiling decoding graph in $dir/dengraph" +if [ -s $dir/dengraph/HCLG.fst ] && [ $dir/dengraph/HCLG.fst -nt $srcdir/final.mdl ]; then + echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation." +else + echo "Making unigram grammar FST in $new_lang" + cat $data/text | utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt | \ + awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \ + utils/make_unigram_grammar.pl | fstcompile | fstarcsort --sort_type=ilabel > $new_lang/G.fst \ + || exit 1; + utils/mkgraph.sh --self-loop-scale $self_loop_scale $new_lang $srcdir $dir/dengraph || exit 1; +fi +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +if [ -z "$feat_type" ]; then + if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=raw; fi +fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + ;; + lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +if [ ! -z "$transform_dir" ]; then + echo "$0: using transforms from $transform_dir" + [ ! -s $transform_dir/num_jobs ] && \ + echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + nj_orig=$(cat $transform_dir/num_jobs) + + if [ $feat_type == "raw" ]; then trans=raw_trans; + else trans=trans; fi + if [ $feat_type == "lda" ] && ! cmp $transform_dir/final.mat $srcdir/final.mat; then + echo "$0: LDA transforms differ between $srcdir and $transform_dir" + exit 1; + fi + if [ ! -f $transform_dir/$trans.1 ]; then + echo "$0: expected $transform_dir/$trans.1 to exist (--transform-dir option)" + exit 1; + fi + if [ $nj -ne $nj_orig ]; then + # Copy the transforms into an archive with an index. + for n in $(seq $nj_orig); do cat $transform_dir/$trans.$n; done | \ + copy-feats ark:- ark,scp:$dir/$trans.ark,$dir/$trans.scp || exit 1; + feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/$trans.scp ark:- ark:- |" + else + # number of jobs matches with alignment dir. 
+ feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/$trans.JOB ark:- ark:- |" + fi +fi + + +# if this job is interrupted by the user, we want any background jobs to be +# killed too. +cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids +} +trap "cleanup" INT QUIT TERM EXIT + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" + cp $srcdir/frame_subsampling_factor $dir +fi + +lattice_determinize_cmd= +if $determinize; then + lattice_determinize_cmd="lattice-determinize-non-compact --acoustic-scale=$acwt --max-mem=$max_mem --minimize=$minimize --prune --beam=$beam ark:- ark:- |" +fi + +if [ $sub_split -eq 1 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.JOB.gz" || exit 1 +else + + # each job from 1 to $nj is split into multiple pieces (sub-split), and we aim + # to have at most two jobs running at each time. The idea is that if we have stragglers + # from one job, we can be processing another one at the same time. + rm $dir/.error 2>/dev/null + + prev_pid= + for n in `seq $[nj+1]`; do + if [ $n -gt $nj ]; then + this_pid= + elif [ -f $dir/.done.$n ] && [ $dir/.done.$n -nt $srcdir/final.mdl ]; then + echo "Not processing subset $n as already done (delete $dir/.done.$n if not)"; + this_pid= + else + sdata2=$data/split$nj/$n/split$sub_split; + if [ ! 
-d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then + split_data.sh --per-utt $sdata/$n $sub_split || exit 1; + fi + mkdir -p $dir/log/$n + mkdir -p $dir/part + feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split$sub_split/JOB/:g` + + $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --determinize-lattice=false \ + --word-determinize=false --phone-determinize=false \ + --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=false \ + --max-mem=$max_mem --word-symbol-table=$lang/words.txt $srcdir/final.mdl \ + $dir/dengraph/HCLG.fst "$feats_subset" \ + "ark:|$lattice_determinize_cmd gzip -c >$dir/lat.$n.JOB.gz" || touch $dir/.error & + this_pid=$! + fi + if [ ! -z "$prev_pid" ]; then # Wait for the previous job; merge the previous set of lattices. + wait $prev_pid + [ -f $dir/.error ] && echo "$0: error generating denominator lattices" && exit 1; + rm $dir/.merge_error 2>/dev/null + echo Merging archives for data subset $prev_n + for k in `seq $sub_split`; do + gunzip -c $dir/lat.$prev_n.$k.gz || touch $dir/.merge_error; + done | gzip -c > $dir/lat.$prev_n.gz || touch $dir/.merge_error; + [ -f $dir/.merge_error ] && echo "$0: Merging lattices for subset $prev_n failed (or maybe some other error)" && exit 1; + rm $dir/lat.$prev_n.*.gz + touch $dir/.done.$prev_n + fi + prev_n=$n + prev_pid=$this_pid + done +fi + + +echo "$0: done generating denominator lattices." + diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py new file mode 100755 index 00000000000..af6afcb99e3 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -0,0 +1,538 @@ +#!/usr/bin/env python + +# tdnn or RNN with 'jesus layer' + +# inputs to jesus layer: +# - for each spliced version of the previous layer the output (of dim --jesus-forward-output-dim) + +# outputs of jesus layer: +# for all layers: +# --jesus-forward-output-dim + + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import re, os, argparse, sys, math, warnings +import imp + +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/train_tdnn.sh for example."); +parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice[:recurrence] indexes at each hidden layer, e.g. '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3'. " + "Note: recurrence indexes are optional, may not appear in 1st layer, and must be " + "either all negative or all positive for any given layer.") + +# Only one of these arguments can be specified, and one of them has to +# be compulsarily specified +feat_group = parser.add_mutually_exclusive_group(required = True) +feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 
13") +feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + +# only one of these arguments can be specified +ivector_group = parser.add_mutually_exclusive_group(required = False) +ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) +ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + +num_target_group = parser.add_mutually_exclusive_group(required = True) +num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") +num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") +num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) +parser.add_argument("--xent-separate-forward-affine", type=str, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default="false", choices = ["false", "true"]) +parser.add_argument("--use-repeated-affine", type=str, + help="if true use RepeatedAffineComponent, else BlockAffineComponent (i.e. no sharing)", + default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-learning-rate-factor", type=float, + help="Learning-rate factor for final affine component", + default=1.0) +parser.add_argument("--self-repair-scale", type=float, + help="Small scale involved in fixing derivatives, if supplied (e.g. try 0.00001)", + default=0.0) +parser.add_argument("--jesus-hidden-dim", type=int, + help="hidden dimension of Jesus layer.", default=10000) +parser.add_argument("--jesus-forward-output-dim", type=int, + help="part of output dimension of Jesus layer that goes to next layer", + default=1000) +parser.add_argument("--jesus-forward-input-dim", type=int, + help="Input dimension of Jesus layer that comes from affine projection " + "from the previous layer (same as output dim of forward affine transform)", + default=1000) +parser.add_argument("--final-hidden-dim", type=int, + help="Final hidden layer dimension-- or if <0, the same as " + "--jesus-forward-input-dim", default=-1) +parser.add_argument("--num-jesus-blocks", type=int, + help="number of blocks in Jesus layer. All configs of the form " + "--jesus-*-dim will be rounded up to be a multiple of this.", + default=100); +parser.add_argument("--jesus-stddev-scale", type=float, + help="Scaling factor on parameter stddev of Jesus layer (smaller->jesus layer learns faster)", + default=1.0) +parser.add_argument("--clipping-threshold", type=float, + help="clipping threshold used in ClipGradient components (only relevant if " + "recurrence indexes are specified). If clipping-threshold=0 no clipping is done", + default=15) +parser.add_argument("config_dir", + help="Directory to write config files and variables"); + +print(' '.join(sys.argv)) + +args = parser.parse_args() + +if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + +## Check arguments. 
+if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + +if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) +elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + +if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + +if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + +if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + +if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + +## Check arguments. +if args.num_jesus_blocks < 1: + sys.exit("invalid --num-jesus-blocks value"); +if args.final_hidden_dim < 0: + args.final_hidden_dim = args.jesus_forward_input_dim + +for name in [ "jesus_hidden_dim", "jesus_forward_output_dim", "jesus_forward_input_dim", + "final_hidden_dim" ]: + old_val = getattr(args, name) + if old_val % args.num_jesus_blocks != 0: + new_val = old_val + args.num_jesus_blocks - (old_val % args.num_jesus_blocks) + printable_name = '--' + name.replace('_', '-') + print('Rounding up {0} from {1} to {2} to be a multiple of --num-jesus-blocks={3} '.format( + printable_name, old_val, new_val, args.num_jesus_blocks)) + setattr(args, name, new_val); + +# this is a bit like a struct, initialized from a string, which describes how to +# set up the statistics-pooling and statistics-extraction components. +# An example string is 'mean(-99:3:9::99)', which means, compute the mean of +# data within a window of -99 to +99, with distinct means computed every 9 frames +# (we round to get the appropriate one), and with the input extracted on multiples +# of 3 frames (so this will force the input to this layer to be evaluated +# every 3 frames). Another example string is 'mean+stddev(-99:3:9:99)', +# which will also cause the standard deviation to be computed. +class StatisticsConfig: + # e.g. c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine') + def __init__(self, config_string, input_dim, input_name): + self.input_dim = input_dim + self.input_name = input_name + + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + config_string) + if m == None: + sys.exit("Invalid splice-index or statistics-config string: " + config_string) + self.output_stddev = (m.group(1) != 'mean') + self.left_context = -int(m.group(2)) + self.input_period = int(m.group(3)) + self.stats_period = int(m.group(4)) + self.right_context = int(m.group(5)) + if not (self.left_context > 0 and self.right_context > 0 and + self.input_period > 0 and self.stats_period > 0 and + self.left_context % self.stats_period == 0 and + self.right_context % self.stats_period == 0 and + self.stats_period % self.input_period == 0): + sys.exit("Invalid configuration of statistics-extraction: " + config_string) + + # OutputDim() returns the output dimension of the node that this produces. + def OutputDim(self): + return self.input_dim * (2 if self.output_stddev else 1) + + # OutputDims() returns an array of output dimensions, consisting of + # [ input-dim ] if just "mean" was specified, otherwise + # [ input-dim input-dim ] + def OutputDims(self): + return [ self.input_dim, self.input_dim ] if self.output_stddev else [ self.input_dim ] + + # Descriptor() returns the textual form of the descriptor by which the + # output of this node is to be accessed. 
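A usage note on the StatisticsConfig class being defined here (assuming the class exactly as written above): the string 'mean+stddev(-99:3:9:99)' applied to a 400-dimensional input parses to left_context=99, right_context=99, input_period=3, stats_period=9 with output_stddev=True, so OutputDim() is 800 (mean and stddev stacked):

c = StatisticsConfig('mean+stddev(-99:3:9:99)', 400, 'jesus1-forward-output-affine')
assert c.OutputDim() == 800
assert c.left_context == 99 and c.right_context == 99
assert c.input_period == 3 and c.stats_period == 9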
+ def Descriptor(self): + return 'Round({0}-pooling-{1}-{2}, {3})'.format(self.input_name, self.left_context, self.right_context, + self.stats_period) + + # This function writes the configuration lines need to compute the specified + # statistics, to the file f. + def WriteConfigs(self, f): + print('component name={0}-extraction-{1}-{2} type=StatisticsExtractionComponent input-dim={3} ' + 'input-period={4} output-period={5} include-variance={6} '.format( + self.input_name, self.left_context, self.right_context, + self.input_dim, self.input_period, self.stats_period, + ('true' if self.output_stddev else 'false')), file=f) + print('component-node name={0}-extraction-{1}-{2} component={0}-extraction-{1}-{2} input={0} '.format( + self.input_name, self.left_context, self.right_context), file=f) + stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) + print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, + stats_dim, self.stats_period, + ('true' if self.output_stddev else 'false')), + file=f) + print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( + self.input_name, self.left_context, self.right_context), file=f) + + + + +## Work out splice_array +## e.g. for +## args.splice_indexes == '-3,-2,-1,0,1,2,3 -3,0:-3 -3,0:-3 -6,-3,0:-6,-3' +## we would have +## splice_array = [ [ -3,-2,...3 ], [-3,0] [-3,0] [-6,-3,0] + + +splice_array = [] +left_context = 0 +right_context = 0 +split_on_spaces = args.splice_indexes.split(" "); # we already checked the string is nonempty. +if len(split_on_spaces) < 2: + sys.exit("invalid --splice-indexes argument, too short: " + + args.splice_indexes) +try: + for string in split_on_spaces: + this_layer = len(splice_array) + + this_splices = string.split(",") + splice_array.append(this_splices) + # the rest of this block updates left_context and right_context, and + # does some checking. 
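As the comment above says, the rest of this block accumulates left_context and right_context over the layers; a worked example (plain integer offsets only, no statistics specifiers) for --splice-indexes '-3,-2,-1,0,1,2,3 -3,0 -3,0 -6,-3,0':

# layer 1: offsets -3..3    -> left += 3, right += 3
# layer 2: offsets -3,0     -> left += 3, right += 0
# layer 3: offsets -3,0     -> left += 3, right += 0
# layer 4: offsets -6,-3,0  -> left += 6, right += 0
left_context, right_context, num_hidden_layers = 15, 3, 4   # what ends up in configs/vars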
+ leftmost_splice = 10000 + rightmost_splice = -10000 + for s in this_splices: + try: + n = int(s) + if n < leftmost_splice: + leftmost_splice = n + if n > rightmost_splice: + rightmost_splice = n + except: + if len(splice_array) == 1: + sys.exit("First dimension of splicing array must not have averaging [yet]") + try: + x = StatisticsConfig(s, 100, 'foo') + except: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) + + if leftmost_splice == 10000 or rightmost_splice == -10000: + sys.exit("invalid element of --splice-indexes: " + string) + left_context += -leftmost_splice + right_context += rightmost_splice +except ValueError as e: + sys.exit("invalid --splice-indexes argument " + args.splice_indexes + " " + str(e)) +left_context = max(0, left_context) +right_context = max(0, right_context) +num_hidden_layers = len(splice_array) +input_dim = len(splice_array[0]) * args.feat_dim + args.ivector_dim + +f = open(args.config_dir + "/vars", "w") +print('left_context=' + str(left_context), file=f) +print('right_context=' + str(right_context), file=f) +print('num_hidden_layers=' + str(num_hidden_layers), file=f) +f.close() + + +f = open(args.config_dir + "/init.config", "w") +print('# Config file for initializing neural network prior to', file=f) +print('# preconditioning matrix computation', file=f) +print('input-node name=input dim=' + str(args.feat_dim), file=f) +list=[ ('Offset(input, {0})'.format(n) if n != 0 else 'input' ) for n in splice_array[0] ] +if args.ivector_dim > 0: + print('input-node name=ivector dim=' + str(args.ivector_dim), file=f) + list.append('ReplaceIndex(ivector, t, 0)') +# example of next line: +# output-node name=output input="Append(Offset(input, -3), Offset(input, -2), Offset(input, -1), ... , Offset(input, 3), ReplaceIndex(ivector, t, 0))" +print('output-node name=output input=Append({0})'.format(", ".join(list)), file=f) +f.close() + + +for l in range(1, num_hidden_layers + 1): + # the following summarizes the structure of the layers: Here, the Jesus component includes ReLU at its input and output, and renormalize + # at its output after the ReLU. + # layer1: splice + LDA-transform + affine + ReLU + renormalize + # layerX: splice + Jesus + affine + ReLU + + # Inside the jesus component is: + # [permute +] ReLU + repeated-affine + ReLU + repeated-affine + # [we make the repeated-affine the last one so we don't have to redo that in backprop]. + # We follow this with a post-jesus composite component containing the operations: + # [permute +] ReLU + renormalize + # call this post-jesusN. + # After this we use dim-range nodes to split up the output into + # [ jesusN-forward-output, jesusN-direct-output and jesusN-projected-output ] + # parts; + # and nodes for the jesusN-forward-affine. + + f = open(args.config_dir + "/layer{0}.config".format(l), "w") + print('# Config file for layer {0} of the network'.format(l), file=f) + if l == 1: + print('component name=lda type=FixedAffineComponent matrix={0}/lda.mat'. + format(args.config_dir), file=f) + splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] + if args.ivector_dim > 0: splices.append('ReplaceIndex(ivector, t, 0)') + orig_input='Append({0})'.format(', '.join(splices)) + # e.g. orig_input = 'Append(Offset(input, -2), ... 
Offset(input, 2), ivector)' + print('component-node name=lda component=lda input={0}'.format(orig_input), + file=f) + # after the initial LDA transform, put a trainable affine layer and a ReLU, followed + # by a NormalizeComponent. + print('component name=affine1 type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} bias-stddev=0'.format( + input_dim, args.jesus_forward_input_dim), file=f) + print('component-node name=affine1 component=affine1 input=lda', + file=f) + # the ReLU after the affine + print('component name=relu1 type=RectifiedLinearComponent dim={1} self-repair-scale={2}'.format( + l, args.jesus_forward_input_dim, args.self_repair_scale), file=f) + print('component-node name=relu1 component=relu1 input=affine1', file=f) + # the renormalize component after the ReLU + print ('component name=renorm1 type=NormalizeComponent dim={0} '.format( + args.jesus_forward_input_dim), file=f) + print('component-node name=renorm1 component=renorm1 input=relu1', file=f) + cur_output = 'renorm1' + cur_affine_output_dim = args.jesus_forward_input_dim + else: + splices = [] + spliced_dims = [] + for s in splice_array[l-1]: + # the connection from the previous layer + try: + offset = int(s) + # it's an integer offset. + splices.append('Offset({0}, {1})'.format(cur_output, offset)) + spliced_dims.append(cur_affine_output_dim) + except: + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) + + # get the input to the Jesus layer. + cur_input = 'Append({0})'.format(', '.join(splices)) + cur_dim = sum(spliced_dims) + + this_jesus_output_dim = args.jesus_forward_output_dim + + # As input to the Jesus component we'll append the spliced input and any + # mean/stddev-stats input, and the first thing inside the component that + # we do is rearrange the dimensions so that things pertaining to a + # particular block stay together. + + column_map = [] + for x in range(0, args.num_jesus_blocks): + dim_offset = 0 + for src_splice in spliced_dims: + src_block_size = src_splice / args.num_jesus_blocks + for y in range(0, src_block_size): + column_map.append(dim_offset + (x * src_block_size) + y) + dim_offset += src_splice + if sorted(column_map) != range(0, sum(spliced_dims)): + print("column_map is " + str(column_map)) + print("num_jesus_blocks is " + str(args.num_jesus_blocks)) + print("spliced_dims is " + str(spliced_dims)) + sys.exit("code error creating new column order") + + need_input_permute_component = (column_map != range(0, sum(spliced_dims))) + + # Now add the jesus component. + + permute_offset = (1 if need_input_permute_component else 0) + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + num_sub_components = 4 + permute_offset + hidden_else_output_dim = args.jesus_hidden_dim + else: # no hidden part in jesus layer. + num_sub_components = 2 + permute_offset + hidden_else_output_dim = args.jesus_forward_output_dim + print('component name=jesus{0} type=CompositeComponent num-components={1}'.format( + l, num_sub_components), file=f, end='') + # print the sub-components of the CompositeComopnent on the same line. + # this CompositeComponent has the same effect as a sequence of + # components, but saves memory. 
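To make the column_map construction above concrete with tiny (unrealistically small) numbers: with spliced_dims = [4, 4] and --num-jesus-blocks=2, each spliced input contributes 2 columns per jesus block, and the map gathers same-block columns from both inputs so each block stays contiguous:

spliced_dims, num_jesus_blocks = [4, 4], 2
column_map = []
for x in range(0, num_jesus_blocks):
    dim_offset = 0
    for src_splice in spliced_dims:
        src_block_size = src_splice // num_jesus_blocks   # integer division, as in the python 2 code above
        for y in range(0, src_block_size):
            column_map.append(dim_offset + (x * src_block_size) + y)
        dim_offset += src_splice
assert column_map == [0, 1, 4, 5, 2, 3, 6, 7]   # != range(8), so a PermuteComponent is needed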
+ if need_input_permute_component: + print(" component1='type=PermuteComponent column-map={1}'".format( + l, ','.join([str(x) for x in column_map])), file=f, end='') + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 1 + permute_offset, + cur_dim, args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 2 + permute_offset, + cur_dim, hidden_else_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(cur_dim / args.num_jesus_blocks)), + file=f, end='') + + if args.jesus_hidden_dim > 0: # normal case where we have jesus-hidden-dim. + print(" component{0}='type=RectifiedLinearComponent dim={1} self-repair-scale={2}'".format( + 3 + permute_offset, hidden_else_output_dim, + args.self_repair_scale), file=f, end='') + + if args.use_repeated_affine == "true": + print(" component{0}='type=NaturalGradientRepeatedAffineComponent input-dim={1} output-dim={2} " + "num-repeats={3} param-stddev={4} bias-mean={5} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt(args.jesus_hidden_dim / args.num_jesus_blocks), + 0.5 * args.jesus_stddev_scale), + file=f, end='') + else: + print(" component{0}='type=BlockAffineComponent input-dim={1} output-dim={2} " + "num-blocks={3} param-stddev={4} bias-stddev=0'".format( + 4 + permute_offset, + args.jesus_hidden_dim, + this_jesus_output_dim, + args.num_jesus_blocks, + args.jesus_stddev_scale / math.sqrt((args.jesus_hidden_dim / args.num_jesus_blocks))), + file=f, end='') + + print("", file=f) # print newline. + print('component-node name=jesus{0} component=jesus{0} input={1}'.format( + l, cur_input), file=f) + + # now print the post-Jesus component which consists of ReLU + + # renormalize. + + num_sub_components = 2 + print('component name=post-jesus{0} type=CompositeComponent num-components=2'.format(l), + file=f, end='') + + # still within the post-Jesus component, print the ReLU + print(" component1='type=RectifiedLinearComponent dim={0} self-repair-scale={1}'".format( + this_jesus_output_dim, args.self_repair_scale), file=f, end='') + # still within the post-Jesus component, print the NormalizeComponent + print(" component2='type=NormalizeComponent dim={0} '".format( + this_jesus_output_dim), file=f, end='') + print("", file=f) # print newline. + print('component-node name=post-jesus{0} component=post-jesus{0} input=jesus{0}'.format(l), + file=f) + + # handle the forward output, we need an affine node for this: + cur_affine_output_dim = (args.jesus_forward_input_dim if l < num_hidden_layers else args.final_hidden_dim) + print('component name=forward-affine{0} type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. 
+ format(l, args.jesus_forward_output_dim, cur_affine_output_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine component=forward-affine{0} input=post-jesus{0}'.format( + l), file=f) + # for each recurrence delay, create an affine node followed by a + # clip-gradient node. [if there are multiple recurrences in the same layer, + # each one gets its own affine projection.] + + # The reason we set the param-stddev to 0 is out of concern that if we + # initialize to nonzero, this will encourage the corresponding inputs at + # the jesus layer to become small (to remove this random input), which + # in turn will make this component learn slowly (due to small + # derivatives). we set the bias-mean to 0.001 so that the ReLUs on the + # input of the Jesus layer are in the part of the activation that has a + # nonzero derivative- otherwise with this setup it would never learn. + + cur_output = 'jesus{0}-forward-output-affine'.format(l) + + + # with each new layer we regenerate the final-affine component, with a ReLU before it + # because the layers we printed don't end with a nonlinearity. + print('component name=final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + cur_affine_output_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu component=final-relu input={0}'.format(cur_output), + file=f) + print('component name=final-affine type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} learning-rate-factor={2} param-stddev=0.0 bias-stddev=0'.format( + cur_affine_output_dim, args.num_targets, + args.final_layer_learning_rate_factor), file=f) + print('component-node name=final-affine component=final-affine input=final-relu', + file=f) + # printing out the next two, and their component-nodes, for l > 1 is not + # really necessary as they will already exist, but it doesn't hurt and makes + # the structure clearer. + if args.include_log_softmax == "true": + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) + + if args.xent_regularize != 0.0: + xent_input = 'final-relu' + if l == num_hidden_layers and args.xent_separate_forward_affine == "true": + print('component name=forward-affine{0}-xent type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} bias-stddev=0'. + format(l, args.jesus_forward_output_dim, args.final_hidden_dim), file=f) + print('component-node name=jesus{0}-forward-output-affine-xent component=forward-affine{0}-xent input=post-jesus{0}'.format( + l), file=f) + print('component name=final-relu-xent type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( + args.final_hidden_dim, args.self_repair_scale), file=f) + print('component-node name=final-relu-xent component=final-relu-xent ' + 'input=jesus{0}-forward-output-affine-xent'.format(l), file=f) + xent_input = 'final-relu-xent' + + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 1.0 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + print('component name=final-affine-xent type=NaturalGradientAffineComponent ' + 'input-dim={0} output-dim={1} param-stddev=0.0 bias-stddev=0 learning-rate-factor={2}'.format( + cur_affine_output_dim, args.num_targets, 0.5 / args.xent_regularize), file=f) + print('component-node name=final-affine-xent component=final-affine-xent input={0}'.format( + xent_input), file=f) + print('component name=final-log-softmax-xent type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) + print('component-node name=final-log-softmax-xent component=final-log-softmax-xent ' + 'input=final-affine-xent', file=f) + print('output-node name=output-xent input=final-log-softmax-xent', file=f) + + f.close() diff --git a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py index 57ed753c8c1..8403c273a9d 100644 --- a/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_tdnn_configs.py @@ -15,12 +15,20 @@ help="Raw feature dimension, e.g. 13") parser.add_argument("--ivector-dim", type=int, help="iVector dimension, e.g. 100", default=0) +parser.add_argument("--include-log-softmax", type=str, + help="add the final softmax layer ", default="true", choices = ["false", "true"]) +parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) parser.add_argument("--pnorm-input-dim", type=int, help="input dimension to p-norm nonlinearities") parser.add_argument("--pnorm-output-dim", type=int, help="output dimension of p-norm nonlinearities") parser.add_argument("--relu-dim", type=int, help="dimension of ReLU nonlinearities") +parser.add_argument("--use-presoftmax-prior-scale", type=str, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = "true") parser.add_argument("--num-targets", type=int, help="number of network targets (e.g. num-pdf-ids/num-leaves)") parser.add_argument("config_dir", @@ -38,8 +46,8 @@ sys.exit("--splice-indexes argument is required"); if args.feat_dim is None or not (args.feat_dim > 0): sys.exit("--feat-dim argument is required"); -if args.num_targets is None or not (args.feat_dim > 0): - sys.exit("--feat-dim argument is required"); +if args.num_targets is None or not (args.num_targets > 0): + sys.exit("--num-targets argument is required"); if not args.relu_dim is None: if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: sys.exit("--relu-dim argument not compatible with " @@ -53,12 +61,16 @@ nonlin_input_dim = args.pnorm_input_dim nonlin_output_dim = args.pnorm_output_dim +if args.use_presoftmax_prior_scale == "true": + use_presoftmax_prior_scale = True +else: + use_presoftmax_prior_scale = False ## Work out splice_array e.g. splice_array = [ [ -3,-2,...3 ], [0], [-2,2], .. [ -8,8 ] ] splice_array = [] left_context = 0 right_context = 0 -split1 = args.splice_indexes.split(" "); # we already checked the string is nonempty. +split1 = args.splice_indexes.split(); # we already checked the string is nonempty. 
if len(split1) < 1: sys.exit("invalid --splice-indexes argument, too short: " + args.splice_indexes) @@ -125,19 +137,22 @@ print('# In nnet3 framework, p in P-norm is always 2.', file=f) print('component name=nonlin{0} type=PnormComponent input-dim={1} output-dim={2}'. format(l, args.pnorm_input_dim, args.pnorm_output_dim), file=f) - print('component name=renorm{0} type=NormalizeComponent dim={1}'.format( - l, nonlin_output_dim), file=f) + print('component name=renorm{0} type=NormalizeComponent dim={1} target-rms={2}'.format( + l, nonlin_output_dim, + (1.0 if l < num_hidden_layers else args.final_layer_normalize_target)), file=f) print('component name=final-affine type=NaturalGradientAffineComponent ' 'input-dim={0} output-dim={1} param-stddev=0 bias-stddev=0'.format( nonlin_output_dim, args.num_targets), file=f) # printing out the next two, and their component-nodes, for l > 1 is not # really necessary as they will already exist, but it doesn't hurt and makes # the structure clearer. - print('component name=final-fixed-scale type=FixedScaleComponent ' - 'scales={0}/presoftmax_prior_scale.vec'.format( - args.config_dir), file=f) - print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( - args.num_targets), file=f) + if args.include_log_softmax == "true": + if use_presoftmax_prior_scale : + print('component name=final-fixed-scale type=FixedScaleComponent ' + 'scales={0}/presoftmax_prior_scale.vec'.format( + args.config_dir), file=f) + print('component name=final-log-softmax type=LogSoftmaxComponent dim={0}'.format( + args.num_targets), file=f) print('# Now for the network structure', file=f) if l == 1: splices = [ ('Offset(input, {0})'.format(n) if n != 0 else 'input') for n in splice_array[l-1] ] @@ -161,11 +176,19 @@ print('component-node name=final-affine component=final-affine input=renorm{0}'. format(l), file=f) - print('component-node name=final-fixed-scale component=final-fixed-scale input=final-affine', - file=f) - print('component-node name=final-log-softmax component=final-log-softmax ' - 'input=final-fixed-scale', file=f) - print('output-node name=output input=final-log-softmax', file=f) + + if args.include_log_softmax == "true": + if use_presoftmax_prior_scale: + print('component-node name=final-fixed-scale component=final-fixed-scale input=final-affine', + file=f) + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-fixed-scale', file=f) + else: + print('component-node name=final-log-softmax component=final-log-softmax ' + 'input=final-affine', file=f) + print('output-node name=output input=final-log-softmax', file=f) + else: + print('output-node name=output input=final-affine', file=f) f.close() # component name=nonlin1 type=PnormComponent input-dim=$pnorm_input_dim output-dim=$pnorm_output_dim diff --git a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh index 24666b8bd02..c36de8c16bf 100755 --- a/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh +++ b/egs/wsj/s5/steps/nnet3/nnet3_to_dot.sh @@ -1,11 +1,12 @@ #!/bin/bash # script showing use of nnet3_to_dot.py -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). +# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti). # Begin configuration section. component_attributes="name,type" node_prefixes="" +info_bin=nnet3-am-info echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. @@ -20,7 +21,7 @@ if [ $# != 3 ]; then echo " --node-prefixes # list of prefixes. 
Nnet3 components/component-nodes with the same prefix" echo " # will be clustered together in the dot-graph" - + exit 1; fi @@ -29,10 +30,10 @@ dot_file=$2 output_file=$3 attr=${node_prefixes:+ --node-prefixes "$node_prefixes"} -nnet3-am-info $model | \ +$info_bin $model | \ steps/nnet3/dot/nnet3_to_dot.py \ --component-attributes "$component_attributes" \ - $attr > $dot_file + $attr $dot_file command -v dot >/dev/null 2>&1 || { echo >&2 "This script requires dot but it's not installed. Please compile $dot_file with dot"; exit 1; } -dot -Tpng $dot_file -o $output_file +dot -Tpdf $dot_file -o $output_file diff --git a/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py new file mode 100644 index 00000000000..cbe2245652b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/nnet3_train_lib.py @@ -0,0 +1,702 @@ +import subprocess +import logging +import math +import re +import time +import argparse +import datetime as dt + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. """ + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +def CheckIfCudaCompiled(): + p = subprocess.Popen("cuda-compiled") + p.communicate() + if p.returncode == 1: + return False + else: + return True + +def RunKaldiCommand(command, wait = True, measure_time = False): + """ Runs commands frequently seen in Kaldi scripts. 
These are usually a + sequence of commands connected by pipes, so we use shell=True """ + #logger.info("Running the command\n{0}".format(command)) + start_time = dt.datetime.now() + p = subprocess.Popen(command, shell = True, + stdout = subprocess.PIPE, + stderr = subprocess.PIPE) + end_time = dt.datetime.now() + if measure_time: + duration = end_time - start_time + logger.info("Ran for {0} seconds".format(duration.seconds)) + if wait: + [stdout, stderr] = p.communicate() + if p.returncode is not 0: + raise Exception("There was an error while running the command {0}\n".format(command)+"-"*10+"\n"+stderr) + return stdout, stderr + else: + return p + +def GetSuccessfulModels(num_models, log_file_pattern, difference_threshold=1.0): + assert(num_models > 0) + + parse_regex = re.compile("LOG .* Overall average objective function for 'output' is ([0-9e.\-+]+) over ([0-9e.\-+]+) frames") + objf = [] + for i in range(num_models): + model_num = i + 1 + logfile = re.sub('%', str(model_num), log_file_pattern) + lines = open(logfile, 'r').readlines() + this_objf = -100000 + for line_num in range(1, len(lines) + 1): + # we search from the end as this would result in + # lesser number of regex searches. Python regex is slow ! + mat_obj = parse_regex.search(lines[-1*line_num]) + if mat_obj is not None: + this_objf = float(mat_obj.groups()[0]) + break; + objf.append(this_objf); + max_index = objf.index(max(objf)) + accepted_models = [] + for i in range(num_models): + if (objf[max_index] - objf[i]) <= difference_threshold: + accepted_models.append(i+1) + + if len(accepted_models) != num_models: + logger.warn("Only {0}/{1} of the models have been accepted for averaging, based on log files {2}.".format(len(accepted_models), num_models, log_file_pattern)) + + return [accepted_models, max_index+1] + +def GetNumberOfLeaves(alidir): + [stdout, stderr] = RunKaldiCommand("tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) + parts = stdout.split() + assert(parts[0] == "num-pdfs") + num_leaves = int(parts[1]) + if num_leaves == 0: + raise Exception("Number of leaves is 0") + return num_leaves + +def GetNumberOfJobs(alidir): + try: + num_jobs = int(open('{0}/num_jobs'.format(alidir), 'r').readline().strip()) + except IOError, ValueError: + raise Exception('Exception while reading the number of alignment jobs') + return num_jobs +def GetIvectorDim(ivector_dir = None): + if ivector_dir is None: + return 0 + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{dir}/ivector_online.scp -".format(dir = ivector_dir)) + ivector_dim = int(stdout_val) + return ivector_dim + +def GetFeatDim(feat_dir): + [stdout_val, stderr_val] = RunKaldiCommand("feat-to-dim --print-args=false scp:{data}/feats.scp -".format(data = feat_dir)) + feat_dim = int(stdout_val) + return feat_dim + +def ReadKaldiMatrix(matrix_file): + try: + lines = map(lambda x: x.split(), open(matrix_file).readlines()) + first_field = lines[0][0] + last_field = lines[-1][-1] + lines[0] = lines[0][1:] + lines[-1] = lines[-1][:-1] + if not (first_field == "[" and last_field == "]"): + raise Exception("Kaldi matrix file has incorrect format, only text format matrix files can be read by this script") + for i in range(len(lines)): + lines[i] = map(lambda x: int(float(x)), lines[i]) + return lines + except IOError: + raise Exception("Error while reading the kaldi matrix file {0}".format(matrix_file)) + +def WriteKaldiMatrix(output_file, matrix): + # matrix is a list of lists + file = open(output_file, 'w') + file.write("[ ") + 
num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to have the same length") + file.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file.write("\n") + file.write(" ]") + file.close() + +import shutil +def CopyEgsPropertiesToExpDir(egs_dir, dir): + try: + for file in ['cmvn_opts', 'splice_opts', 'final.mat']: + file_name = '{dir}/{file}'.format(dir = egs_dir, file = file) + if os.path.isfile(file_name): + shutil.copy2(file_name, dir) + except IOError: + raise Exception("Error while trying to copy egs property files to {dir}".format(dir = dir)) + +def SplitData(data, num_jobs): + RunKaldiCommand("utils/split_data.sh {data} {num_jobs}".format(data = data, + num_jobs = num_jobs)) + +def ParseModelConfigVarsFile(var_file): + try: + var_file_handle = open(var_file, 'r') + model_left_context = None + model_right_context = None + num_hidden_layers = None + for line in var_file_handle: + parts = line.split('=') + field_name = parts[0].strip() + field_value = parts[1] + if field_name in ['model_left_context', 'left_context']: + model_left_context = int(field_value) + elif field_name in ['model_right_context', 'right_context']: + model_right_context = int(field_value) + elif field_name == 'num_hidden_layers': + num_hidden_layers = int(field_value) + + if model_left_context is not None and model_right_context is not None and num_hidden_layers is not None: + return [model_left_context, model_right_context, num_hidden_layers] + + except ValueError: + # we will throw an error at the end of the function so I will just pass + pass + + raise Exception('Error while parsing the file {0}'.format(var_file)) + + +def GenerateEgs(data, alidir, egs_dir, + left_context, right_context, + valid_left_context, valid_right_context, + run_opts, stage = 0, + feat_type = 'raw', online_ivector_dir = None, + samples_per_iter = 20000, frames_per_eg = 20, + egs_opts = None, cmvn_opts = None, transform_dir = None): + + RunKaldiCommand(""" +steps/nnet3/get_egs.sh {egs_opts} \ + --cmd "{command}" \ + --cmvn-opts "{cmvn_opts}" \ + --feat-type {feat_type} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{ivector_dir}" \ + --left-context {left_context} --right-context {right_context} \ + --valid-left-context {valid_left_context} \ + --valid-right-context {valid_right_context} \ + --stage {stage} \ + --samples-per-iter {samples_per_iter} \ + --frames-per-eg {frames_per_eg} \ + {data} {alidir} {egs_dir} + """.format(command = run_opts.command, + cmvn_opts = cmvn_opts if cmvn_opts is not None else '', + feat_type = feat_type, + transform_dir = transform_dir if transform_dir is not None else '', + ivector_dir = online_ivector_dir if online_ivector_dir is not None else '', + left_context = left_context, right_context = right_context, + valid_left_context = valid_left_context, + valid_right_context = valid_right_context, + stage = stage, samples_per_iter = samples_per_iter, + frames_per_eg = frames_per_eg, data = data, alidir = alidir, + egs_dir = egs_dir, + egs_opts = egs_opts if egs_opts is not None else '' )) + +def VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context): + try: + egs_feat_dim = int(open('{0}/info/feat_dim'.format(egs_dir)).readline()) + egs_ivector_dim = int(open('{0}/info/ivector_dim'.format(egs_dir)).readline()) + egs_left_context = 
int(open('{0}/info/left_context'.format(egs_dir)).readline()) + egs_right_context = int(open('{0}/info/right_context'.format(egs_dir)).readline()) + if (feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim): + raise Exception('There is mismatch between featdim/ivector_dim of the current experiment and the provided egs directory') + + if (egs_left_context < left_context) or (egs_right_context < right_context): + raise Exception("""The egs have insufficient context. +Required left context is {left_req_ctx} and available context is {left_av_ctx}. +Required right context is {right_req_ctx} and available context is {right_av_ctx}. +""".format(left_req_ctx = left_context, left_av_ctx = egs_left_context, + right_req_ctx = right_context, right_av_ctx = right_context)) + + frames_per_eg = int(open('{0}/info/frames_per_eg'.format(egs_dir)).readline()) + num_archives = int(open('{0}/info/num_archives'.format(egs_dir)).readline()) + + return [egs_left_context, egs_right_context, frames_per_eg, num_archives] + except IOError, ValueError: + raise Exception('The egs dir {0} has missing or malformed files'.format(egs_dir)) + +def ComputePreconditioningMatrix(dir, egs_dir, num_lda_jobs, run_opts, + max_lda_jobs = None, rand_prune = 4.0, + lda_opts = None): + if max_lda_jobs is not None: + if num_lda_jobs > max_lda_jobs: + num_lda_jobs = max_lda_jobs + + RunKaldiCommand(""" +{command} JOB=1:{num_lda_jobs} {dir}/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune={rand_prune} \ + {dir}/init.raw "ark:{egs_dir}/egs.JOB.ark" {dir}/JOB.lda_stats""".format( + command = run_opts.command, + num_lda_jobs = num_lda_jobs, + dir = dir, + egs_dir = egs_dir, + rand_prune = rand_prune)) + + # the above command would have generated dir/{1..num_lda_jobs}.lda_stats + lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_transform_stats.log \ + sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( + command = run_opts.command, + dir = dir, lda_stat_files = " ".join(lda_stat_files))) + + for file in lda_stat_files: + try: + os.remove(file) + except OSError: + raise Exception("There was error while trying to remove lda stat files.") + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
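+    # The resulting transform is written to {dir}/lda.mat and symlinked into
+    # {dir}/configs/lda.mat below.  Illustrative call (argument values are
+    # only an example, not taken from a real run):
+    #   ComputePreconditioningMatrix('exp/nnet3/tdnn', 'exp/nnet3/tdnn/egs',
+    #                                num_lda_jobs = 10, run_opts = run_opts,
+    #                                max_lda_jobs = 10, rand_prune = 4.0)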
+ + RunKaldiCommand(""" +{command} {dir}/log/get_transform.log \ + nnet-get-feature-transform {lda_opts} {dir}/lda.mat {dir}/lda_stats + """.format(command = run_opts.command,dir = dir, + lda_opts = lda_opts if lda_opts is not None else "")) + + ForceSymlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) + +import os, errno + +def ForceSymlink(file1, file2): + try: + os.symlink(file1, file2) + except OSError, e: + if e.errno == errno.EEXIST: + os.remove(file2) + os.symlink(file1, file2) + +def ComputePresoftmaxPriorScale(dir, alidir, num_jobs, run_opts, + presoftmax_prior_scale_power = None): + + # getting the raw pdf count + RunKaldiCommand(""" +{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ +ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ +post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- {dir}/pdf_counts.JOB + """.format(command = run_opts.command, + num_jobs = num_jobs, + dir = dir, + alidir = alidir)) + + RunKaldiCommand(""" +{command} {dir}/log/sum_pdf_counts.log \ +vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts + """.format(command = run_opts.command, dir = dir)) + + import glob + for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + os.remove(file) + + smooth=0.01 + pdf_counts = ReadKaldiMatrix('{0}/pdf_counts'.format(dir))[0] + total = sum(pdf_counts) + average_count = total/len(pdf_counts) + scales = [] + for i in range(len(pdf_counts)): + scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) + num_pdfs = len(pdf_counts) + scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + + output_file = "{0}/presoftmax_prior_scale.vec".format(dir) + WriteKaldiMatrix(output_file, [scaled_counts]) + ForceSymlink("../presoftmax_prior_scale.vec", "{0}/configs/presoftmax_prior_scale.vec".format(dir)) + +def PrepareInitialAcousticModel(dir, alidir, run_opts): + """ Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model.""" + + RunKaldiCommand(""" +{command} {dir}/log/add_first_layer.log \ + nnet3-init --srand=-3 {dir}/init.raw {dir}/configs/layer1.config {dir}/0.raw """.format(command = run_opts.command, + dir = dir)) + + # Convert to .mdl, train the transitions, set the priors. + RunKaldiCommand(""" +{command} {dir}/log/init_mdl.log \ + nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl + """.format(command = run_opts.command, + dir = dir, alidir = alidir)) + +def VerifyIterations(num_iters, num_epochs, num_hidden_layers, + num_archives, max_models_combine, add_layers_period, + num_jobs_final): + """ Verifies that number of iterations are sufficient for various + phases of training.""" + + finish_add_layers_iter = num_hidden_layers * add_layers_period + + if num_iters <= (finish_add_layers_iter + 2): + raise Exception(' There are insufficient number of epochs. These are not even sufficient for layer-wise discriminatory training.') + + + approx_iters_per_epoch_final = num_archives/num_jobs_final + # First work out how many iterations we want to combine over in the final + # nnet3-combine-fast invocation. (We may end up subsampling from these if the + # number exceeds max_model_combine). 
The number we use is: + # min(max(max_models_combine, approx_iters_per_epoch_final), + # 1/2 * iters_after_last_layer_added) + half_iters_after_add_layers = (num_iters - finish_add_layers_iter)/2 + num_iters_combine = min(max(max_models_combine, approx_iters_per_epoch_final), half_iters_after_add_layers) + return num_iters_combine + +def GetRealignIters(realign_times, num_iters, + num_jobs_initial, num_jobs_final): + """ Takes the realign_times string and identifies the approximate + iterations at which realignments have to be done.""" + # realign_times is a space seperated string of values between 0 and 1 + + realign_iters = [] + for realign_time in realign_times.split(): + realign_time = float(realign_time) + assert(realign_time > 0 and realign_time < 1) + if num_jobs_initial == num_jobs_final: + realign_iter = int(0.5 + num_iters * realign_time) + else: + realign_iter = math.sqrt((1 - realign_time) * math.pow(num_jobs_initial, 2) + + realign_time * math.pow(num_jobs_final, 2)) + realign_iter = realign_iter - num_jobs_initial + realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter * num_iters + realign_iters.append(int(realign_iter)) + + return realign_iters + +def Align(dir, data, lang, run_opts, iter = None, transform_dir = None, + online_ivector_dir = None): + + alidir = '{dir}/ali{ali_suffix}'.format(dir = dir, + ali_suffix = "_iter_{0}".format(iter) if iter is not None else "") + + logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format( + gpu = " using gpu " if run_opts.realign_use_gpu else " ", + num_jobs = run_opts.realign_num_jobs )) + RunKaldiCommand(""" +steps/nnet3/align.sh --nj {num_jobs_align} --cmd "{align_cmd} {align_queue_opt}" \ + --use-gpu {align_use_gpu} \ + --transform-dir "{transform_dir}" \ + --online-ivector-dir "{online_ivector_dir}" \ + --iter "{iter}" {data} {lang} {dir} {alidir} + """.format(dir = dir, align_use_gpu = "yes" if run_opts.realign_use_gpu else "no", + align_cmd = run_opts.realign_command, + align_queue_opt = run_opts.realign_queue_opt, + num_jobs_align = run_opts.realign_num_jobs, + transform_dir = transform_dir if transform_dir is not None else "", + online_ivector_dir = online_ivector_dir if online_ivector_dir is not None else "", + iter = iter if iter is not None else "", + alidir = alidir, + lang = lang, data = data)) + return alidir + +def Realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, + prior_subset_size, num_archives, run_opts, + transform_dir = None, online_ivector_dir = None): + raise Exception("Realignment stage has not been implemented in nnet3") + logger.info("Getting average posterior for purposes of adjusting the priors.") + # Note: this just uses CPUs, using a smallish subset of data. + # always use the first egs archive, which makes the script simpler; + # we're using different random subsets of it. 
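+    # NOTE: everything below the raise above is unreachable until realignment
+    # is implemented; it only sketches the intended flow: average the output
+    # posteriors over a subset of egs, re-adjust the model priors, realign the
+    # data and relabel the existing egs.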
+ + avg_post_vec_file = ComputeAveragePosterior(dir, iter, prev_egs_dir, + num_archives, prior_subset_size, run_opts) + + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + logger.info("Re-adjusting priors based on computed posteriors") + model = '{0}/{1}.mdl'.format(dir, iter) + AdjustAmPriors(dir, model, avg_post_vec_file, model, run_opts) + + alidir = Align(dir, feat_dir, lang, run_opts, iter, + transform_dir, online_ivector_dir) + RunKaldiCommand(""" +steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} {alidir} \ + {prev_egs_dir} {cur_egs_dir}""".format( + command = run_opts.command, + iter = iter, + dir = dir, + alidir = alidir, + prev_egs_dir = prev_egs_dir, + cur_egs_dir = cur_egs_dir)) + +def GetLearningRate(iter, num_jobs, num_iters, num_archives_processed, + num_archives_to_process, + initial_effective_lrate, final_effective_lrate): + if iter + 1 >= num_iters: + effective_learning_rate = final_effective_lrate + else: + effective_learning_rate = initial_effective_lrate * math.exp(num_archives_processed * math.log(final_effective_lrate/ initial_effective_lrate)/num_archives_to_process) + + return num_jobs * effective_learning_rate + +def DoShrinkage(iter, model_file, non_linearity, shrink_threshold): + + if iter == 0: + return True + + try: + output, error = RunKaldiCommand("nnet3-am-info --print-args=false {model_file} | grep {non_linearity}".format(non_linearity = non_linearity, model_file = model_file)) + output = output.strip().split("\n") + # eg. + # component name=Lstm1_f type=SigmoidComponent, dim=1280, count=5.02e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.17,0.19,0.24 0.28,0.33,0.44,0.62,0.79 0.96,0.99,1.0,1.0), mean=0.482, stddev=0.198], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] + + mean_pattern = re.compile(".*deriv-avg=.*mean=([0-9\.]+).*") + total_mean_deriv = 0 + num_derivs = 0 + for line in output: + mat_obj = mean_pattern.search(line) + if mat_obj is None: + raise Exception("Something went wrong, unable to find deriv-avg in the line \n{0}".format(line)) + mean_deriv = float(mat_obj.groups()[0]) + total_mean_deriv += mean_deriv + num_derivs += 1 + if total_mean_deriv / num_derivs < shrink_threshold: + return True + except ValueError: + raise Exception("Error while parsing the model info output") + + return False + +def ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts, wait = False): + + model = '{0}/{1}.mdl'.format(dir, iter) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_valid.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark,bg:nnet3-merge-egs ark:{egs_dir}/valid_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + RunKaldiCommand(""" +{command} {dir}/log/compute_prob_train.{iter}.log \ + nnet3-compute-prob "nnet3-am-copy --raw=true {model} - |" \ + "ark,bg:nnet3-merge-egs ark:{egs_dir}/train_diagnostic.egs ark:- |" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + egs_dir = egs_dir), wait = wait) + + +def ComputeProgress(dir, iter, egs_dir, run_opts, wait=False): + + prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) + model = '{0}/{1}.mdl'.format(dir, iter) + RunKaldiCommand(""" +{command} {dir}/log/progress.{iter}.log \ +nnet3-info "nnet3-am-copy --raw=true {model} - |" '&&' \ +nnet3-show-progress --use-gpu=no 
"nnet3-am-copy --raw=true {prev_model} - |" "nnet3-am-copy --raw=true {model} - |" \ +"ark,bg:nnet3-merge-egs --minibatch-size=256 ark:{egs_dir}/train_diagnostic.egs ark:-|" + """.format(command = run_opts.command, + dir = dir, + iter = iter, + model = model, + prev_model = prev_model, + egs_dir = egs_dir), wait = wait) + +def CombineModels(dir, num_iters, num_iters_combine, egs_dir, + run_opts, chunk_width = None): + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine + raw_model_strings = [] + print num_iters_combine + for iter in range(num_iters - num_iters_combine + 1, num_iters + 1): + model_file = '{0}/{1}.mdl'.format(dir, iter) + if not os.path.exists(model_file): + raise Exception('Model file {0} missing'.format(model_file)) + raw_model_strings.append('"nnet3-am-copy --raw=true {0} -|"'.format(model_file)) + + if chunk_width is not None: + # this is an RNN model + mbsize = int(1024.0/(chunk_width)) + else: + mbsize = 1024 + + RunKaldiCommand(""" +{command} {combine_queue_opt} {dir}/log/combine.log \ +nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 {raw_models} "ark,bg:nnet3-merge-egs --measure-output-frames=false --minibatch-size={mbsize} ark:{egs_dir}/combine.egs ark:-|" \ +"|nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl {dir}/combined.mdl" + """.format(command = run_opts.command, + combine_queue_opt = run_opts.combine_queue_opt, + dir = dir, raw_models = " ".join(raw_model_strings), + mbsize = mbsize, + num_iters = num_iters, + egs_dir = egs_dir)) + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + ComputeTrainCvProbabilities(dir, 'combined', egs_dir, run_opts, wait = False) + +def ComputeAveragePosterior(dir, iter, egs_dir, num_archives, + prior_subset_size, run_opts): + # Note: this just uses CPUs, using a smallish subset of data. + """ Computes the average posterior of the network""" + import glob + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + + if run_opts.num_jobs_compute_prior > num_archives: + egs_part = 1 + else: + egs_part = 'JOB' + + RunKaldiCommand(""" +{command} JOB=1:{num_jobs_compute_prior} {prior_queue_opt} {dir}/log/get_post.{iter}.JOB.log \ + nnet3-subset-egs --srand=JOB --n={prior_subset_size} ark:{egs_dir}/egs.{egs_part}.ark ark:- \| \ + nnet3-merge-egs --measure-output-frames=true --minibatch-size=128 ark:- ark:- \| \ + nnet3-compute-from-egs {prior_gpu_opt} --apply-exp=true \ + "nnet3-am-copy --raw=true {dir}/combined.mdl -|" ark:- ark:- \| \ +matrix-sum-rows ark:- ark:- \| vector-sum ark:- {dir}/post.{iter}.JOB.vec + """.format(command = run_opts.command, + dir = dir, + num_jobs_compute_prior = run_opts.num_jobs_compute_prior, + prior_queue_opt = run_opts.prior_queue_opt, + iter = iter, prior_subset_size = prior_subset_size, + egs_dir = egs_dir, egs_part = egs_part, + prior_gpu_opt = run_opts.prior_gpu_opt)) + + # make sure there is time for $dir/post.{iter}.*.vec to appear. 
+ time.sleep(5) + avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) + RunKaldiCommand(""" +{command} {dir}/log/vector_sum.{iter}.log \ + vector-sum {dir}/post.{iter}.*.vec {output_file} + """.format(command = run_opts.command, + dir = dir, iter = iter, output_file = avg_post_vec_file)) + + for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + os.remove(file) + return avg_post_vec_file + +def AdjustAmPriors(dir, input_model, avg_posterior_vector, output_model, run_opts): + RunKaldiCommand(""" +{command} {dir}/log/adjust_priors.final.log \ +nnet3-am-adjust-priors {input_model} {avg_posterior_vector} {output_model} + """.format(command = run_opts.command, + dir = dir, input_model = input_model, + avg_posterior_vector = avg_posterior_vector, + output_model = output_model)) + +def RemoveEgs(egs_dir): + RunKaldiCommand("steps/nnet2/remove_egs.sh {egs_dir}".format(egs_dir=egs_dir)) + +def CleanNnetDir(nnet_dir, num_iters, egs_dir, num_iters_combine = None, + preserve_model_interval = 100, + remove_egs = True): + try: + if remove_egs: + RemoveEgs(egs_dir) + + for iter in range(num_iters): + RemoveModel(nnet_dir, iter, num_iters, 1, + preserve_model_interval) + except (IOError, OSError) as err: + logger.warning("Error while cleaning up the nnet directory") + raise err + +def RemoveModel(nnet_dir, iter, num_iters, num_iters_combine = None, + preserve_model_interval = 100): + if iter % preserve_model_interval == 0: + return + if num_iters_combine is not None and iter >= num_iters - num_iters_combine + 1 : + return + file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + if os.path.isfile(file_name): + os.remove(file_name) + +def ComputeLifterCoeffs(lifter, dim): + coeffs = [0] * dim + for i in range(0, dim): + coeffs[i] = 1.0 + 0.5 * lifter * math.sin(math.pi * i / float(lifter)); + + return coeffs + +def ComputeIdctMatrix(K, N, cepstral_lifter=0): + matrix = [[0] * K for i in range(N)] + # normalizer for X_0 + normalizer = math.sqrt(1.0 / float(N)); + for j in range(0, N): + matrix[j][0] = normalizer; + # normalizer for other elements + normalizer = math.sqrt(2.0 / float(N)); + for k in range(1, K): + for n in range(0, N): + matrix[n][k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k); + + if cepstral_lifter != 0: + lifter_coeffs = ComputeLifterCoeffs(cepstral_lifter, K) + for k in range(0, K): + for n in range(0, N): + matrix[n][k] = matrix[n][k] / lifter_coeffs[k]; + + return matrix + +def WriteIdctMatrix(feat_dim, cepstral_lifter, file_path): + # generate the IDCT matrix and write to the file + idct_matrix = ComputeIdctMatrix(feat_dim, feat_dim, cepstral_lifter) + # append a zero column to the matrix, this is the bias of the fixed affine component + for k in range(0, feat_dim): + idct_matrix[k].append(0) + WriteKaldiMatrix(file_path, idct_matrix) + diff --git a/egs/wsj/s5/steps/nnet3/report/generate_plots.py b/egs/wsj/s5/steps/nnet3/report/generate_plots.py new file mode 100755 index 00000000000..bed8abd132b --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/generate_plots.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. 
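+# This script parses nnet3 training logs and writes accuracy (or
+# log-probability for chain models), log-likelihood, non-linearity-statistics
+# and parameter-difference reports, and, when matplotlib is available, a
+# pdflatex-compiled PDF report.
+# Example invocation (paths are illustrative):
+#   steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn2 \
+#       exp/nnet3/tdnn1 exp/nnet3/tdnn1/report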
+ +import warnings +import imp +import argparse +import os +import errno +import logging +import re +import subprocess +train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +try: + import matplotlib as mpl + mpl.use('Agg') + import matplotlib.pyplot as plt + from matplotlib.backends.backend_pdf import PdfPages + import numpy as np + + plot = True +except ImportError: + warnings.warn(""" +This script requires matplotlib and numpy. Please install them to generate plots. Proceeding with generation of tables. +If you are on a cluster where you do not have admin rights you could try using virtualenv.""") + +nlp = imp.load_source('nlp', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Generating plots') + + + + +def GetArgs(): + parser = argparse.ArgumentParser(description=""" +Parses the training logs and generates a variety of plots. +example : steps/nnet3/report/generate_plots.py --comparison-dir exp/nnet3/tdnn1 --comparison-dir exp/nnet3/tdnn2 exp/nnet3/tdnn exp/nnet3/tdnn/report +""") + parser.add_argument("--comparison-dir", type=str, action='append', help="other experiment directories for comparison. These will only be used for plots, not tables") + parser.add_argument("--start-iter", type=int, help="Iteration from which plotting will start", default = 1) + parser.add_argument("--is-chain", type=str, default = False, action = train_lib.StrToBoolAction, help="Iteration from which plotting will start") + parser.add_argument("exp_dir", help="experiment directory, e.g. exp/nnet3/tdnn") + parser.add_argument("output_dir", help="experiment directory, e.g. exp/nnet3/tdnn/report") + + args = parser.parse_args() + if args.comparison_dir is not None and len(args.comparison_dir) > 6: + raise Exception("max 6 --comparison-dir options can be specified. If you want to compare with more comparison_dir, you would have to carefully tune the plot_colors variable which specified colors used for plotting.") + assert(args.start_iter >= 1) + return args + +plot_colors = ['red', 'blue', 'green', 'black', 'magenta', 'yellow', 'cyan' ] + + + +class LatexReport: + def __init__(self, pdf_file): + self.pdf_file = pdf_file + self.document=[] + self.document.append(""" +\documentclass[prl,10pt,twocolumn]{revtex4} +\usepackage{graphicx} % Used to import the graphics +\\begin{document} +""") + + def AddFigure(self, figure_pdf, title): + # we will have keep extending this replacement list based on errors during compilation + # escaping underscores in the title + title = "\\texttt{"+re.sub("_","\_", title)+"}" + fig_latex = """ +%... +\\newpage +\\begin{figure}[h] + \\begin{center} + \caption{""" + title + """} + \includegraphics[width=\\textwidth]{""" + figure_pdf + """} + \end{center} +\end{figure} +\clearpage +%... 
+""" + self.document.append(fig_latex) + + def Close(self): + self.document.append("\end{document}") + return self.Compile() + + def Compile(self): + root, ext = os.path.splitext(self.pdf_file) + dir_name = os.path.dirname(self.pdf_file) + latex_file = root + ".tex" + lat_file = open(latex_file, "w") + lat_file.write("\n".join(self.document)) + lat_file.close() + logger.info("Compiling the latex report.") + try: + proc = subprocess.Popen(['pdflatex', '-output-directory='+str(dir_name), latex_file], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + proc.communicate() + except Exception as e: + logger.warning("There was an error compiling the latex file {0}, please do it manually.".format(latex_file)) + return False + return True + +def GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + if plot: + fig = plt.figure() + plots = [] + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + for dir in dirs: + [accuracy_report, accuracy_times, accuracy_data] = nlp.GenerateAccuracyReport(dir, key) + if index == 0: + # this is the main experiment directory + acc_file = open("{0}/{1}.log".format(output_dir, file_basename), "w") + acc_file.write(accuracy_report) + acc_file.close() + + if plot: + color_val = plot_colors[index] + data = np.array(accuracy_data) + if data.shape[0] == 0: + raise Exception("Couldn't find any rows for the accuracy plot") + data = data[data[:,0]>=start_iter, :] + plot_handle, = plt.plot(data[:, 0], data[:, 1], color = color_val, linestyle = "--", label = "train {0}".format(dir)) + plots.append(plot_handle) + plot_handle, = plt.plot(data[:, 0], data[:, 2], color = color_val, label = "valid {0}".format(dir)) + plots.append(plot_handle) + index += 1 + if plot: + plt.xlabel('Iteration') + plt.ylabel(key) + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.2 + len(dirs) * -0.1 ), ncol=1, borderaxespad=0.) 
+ plt.grid(True) + fig.suptitle("{0} plot".format(key)) + figfile_name = '{0}/{1}.pdf'.format(output_dir, file_basename) + plt.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Plot of {0} vs iterations".format(key)) + +def GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = None, start_iter = 1, latex_report = None): + assert(start_iter >= 1) + + comparison_dir = [] if comparison_dir is None else comparison_dir + dirs = [exp_dir] + comparison_dir + index = 0 + stats_per_dir = {} + + for dir in dirs: + stats_per_component_per_iter = nlp.ParseProgressLogsForNonlinearityStats(dir) + stats_per_dir[dir] = stats_per_component_per_iter + + # convert the nonlin stats into tables + stat_tables_per_component_per_dir = {} + for dir in dirs: + stats_per_component_per_iter = stats_per_dir[dir] + component_names = stats_per_component_per_iter.keys() + stat_tables_per_component = {} + for component_name in component_names: + comp_data = stats_per_component_per_iter[component_name] + comp_type = comp_data['type'] + comp_stats = comp_data['stats'] + iters = comp_stats.keys() + iters.sort() + iter_stats = [] + for iter in iters: + iter_stats.append([iter] + comp_stats[iter]) + stat_tables_per_component[component_name] = iter_stats + stat_tables_per_component_per_dir[dir] = stat_tables_per_component + + main_stat_tables = stat_tables_per_component_per_dir[exp_dir] + for component_name in main_stat_tables.keys(): + # this is the main experiment directory + file = open("{dir}/nonlinstats_{comp_name}.log".format(dir = output_dir, comp_name = component_name), "w") + file.write("Iteration\tValueMean\tValueStddev\tDerivMean\tDerivStddev\n") + iter_stat_report = "" + iter_stats = main_stat_tables[component_name] + for row in iter_stats: + iter_stat_report += "\t".join(map(lambda x: str(x), row)) + "\n" + file.write(iter_stat_report) + file.close() + + if plot: + main_component_names = main_stat_tables.keys() + main_component_names.sort() + + plot_component_names = set(main_component_names) + for dir in dirs: + component_names = set(stats_per_dir[dir].keys()) + plot_component_names = plot_component_names.intersection(component_names) + plot_component_names = list(plot_component_names) + plot_component_names.sort() + if plot_component_names != main_component_names: + logger.warning("The components in all the neural networks in the given experiment dirs are not the same, so comparison plots are provided only for common component names. 
Make sure that these are comparable experiments before analyzing these plots.") + + fig = plt.figure() + for component_name in main_component_names: + fig.clf() + index = 0 + plots = [] + for dir in dirs: + color_val = plot_colors[index] + index += 1 + try: + iter_stats = stat_tables_per_component_per_dir[dir][component_name] + except KeyError: + # this component is not available in this network so lets not just plot it + continue + + data = np.array(iter_stats) + data = data[data[:,0] >=start_iter, :] + ax = plt.subplot(211) + mp, = ax.plot(data[:,0], data[:,1], color=color_val, label="Mean {0}".format(dir)) + msph, = ax.plot(data[:,0], data[:,1] + data[:,2], color=color_val, linestyle='--', label = "Mean+-Stddev {0}".format(dir)) + mspl, = ax.plot(data[:,0], data[:,1] - data[:,2], color=color_val, linestyle='--') + plots.append(mp) + plots.append(msph) + ax.set_ylabel('Value-{0}'.format(comp_type)) + ax.grid(True) + + ax = plt.subplot(212) + mp, = ax.plot(data[:,0], data[:,3], color=color_val) + msph, = ax.plot(data[:,0], data[:,3] + data[:,4], color=color_val, linestyle='--') + mspl, = ax.plot(data[:,0], data[:,3] - data[:,4], color=color_val, linestyle='--') + ax.set_xlabel('Iteration') + ax.set_ylabel('Derivative-{0}'.format(comp_type)) + ax.grid(True) + + lgd = plt.legend(handles=plots, loc='lower center', bbox_to_anchor=(0.5, -0.5 + len(dirs) * -0.2 ), ncol=1, borderaxespad=0.) + plt.grid(True) + fig.suptitle("Mean and stddev of the value and derivative at {comp_name}".format(comp_name = component_name)) + figfile_name = '{dir}/nonlinstats_{comp_name}.pdf'.format(dir = output_dir, comp_name = component_name) + fig.savefig(figfile_name, bbox_extra_artists=(lgd,), bbox_inches='tight') + if latex_report is not None: + latex_report.AddFigure(figfile_name, "Mean and stddev of the value and derivative at {0}".format(component_name)) + +def GeneratePlots(exp_dir, output_dir, comparison_dir = None, start_iter = 1, is_chain = False): + try: + os.makedirs(output_dir) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(output_dir): + pass + else: + raise e + if plot: + latex_report = LatexReport("{0}/report.pdf".format(output_dir)) + else: + latex_report = None + + if is_chain: + logger.info("Generating log-probability plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-probability', file_basename = 'log_probability', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + else: + logger.info("Generating accuracy plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'accuracy', file_basename = 'accuracy', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating log-likelihood plots") + GenerateAccuracyPlots(exp_dir, output_dir, plot, key = 'log-likelihood', file_basename = 'loglikelihood', comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating non-linearity stats plots") + GenerateNonlinStatsPlots(exp_dir, output_dir, plot, comparison_dir = comparison_dir, start_iter = start_iter, latex_report = latex_report) + + logger.info("Generating parameter difference files") + # Parameter changes + key_file = {"Parameter differences":"parameter.diff", + "Relative parameter differences":"relative_parameter.diff"} + for key in key_file.keys(): + file = open("{0}/{1}".format(output_dir, key_file[key]), "w") + data = nlp.ParseProgressLogsForParamDiff(exp_dir, key) + for row in data: + file.write(" 
".join(map(lambda x:str(x),row))+"\n") + file.close() + if plot and latex_report is not None: + has_compiled = latex_report.Close() + if has_compiled: + logger.info("Report has been generated. You can find it at the location {0}".format("{0}/report.pdf".format(output_dir))) + +def Main(): + args = GetArgs() + GeneratePlots(args.exp_dir, args.output_dir, + comparison_dir = args.comparison_dir, + start_iter = args.start_iter, + is_chain = args.is_chain) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py new file mode 100755 index 00000000000..2268fbadd72 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/report/nnet3_log_parse_lib.py @@ -0,0 +1,154 @@ +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + +from __future__ import division +import sys, glob, re, math, datetime, argparse +import imp +import datetime as dt + +ntl = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') + +#exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83 0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18 0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397] +def ParseProgressLogsForNonlinearityStats(exp_dir): + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + stats_per_component_per_iter = {} + progress_log_lines = ntl.RunKaldiCommand('grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files), measure_time = False)[0] + parse_regex = re.compile(".*progress.([0-9]+).log:component name=(.+) type=(.*)Component,.*value-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*deriv-avg=\[.*mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]") + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.502', '0.23', '0.134', '0.0397') + iteration = int(groups[0]) + component_name = groups[1] + component_type = groups[2] + value_mean = float(groups[3]) + value_stddev = float(groups[4]) + deriv_mean = float(groups[5]) + deriv_stddev = float(groups[6]) + try: + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + except KeyError: + stats_per_component_per_iter[component_name] = {} + stats_per_component_per_iter[component_name]['type'] = component_type + stats_per_component_per_iter[component_name]['stats'] = {} + stats_per_component_per_iter[component_name]['stats'][iteration] = [value_mean, value_stddev, deriv_mean, deriv_stddev] + + return stats_per_component_per_iter + +def ParseDifferenceString(string): + dict = {} + for parts in string.split(): + sub_parts = parts.split(":") + dict[sub_parts[0]] = float(sub_parts[1]) + return dict + +#exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter differences per layer are [ Cwrnn1_T3_W_r:0.0171537 Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07 Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521 Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978 Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588 
Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754 Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] +def ParseProgressLogsForParamDiff(exp_dir, pattern): + if pattern not in set(["Relative parameter differences", "Parameter differences"]): + raise Exception("Unknown value for pattern : {0}".format(pattern)) + + progress_log_files = "%s/log/progress.*.log" % (exp_dir) + progress_per_iter = {} + component_names = set([]) + progress_log_lines = ntl.RunKaldiCommand('grep -e "{0}" {1}'.format(pattern, progress_log_files), measure_time = False)[0] + parse_regex = re.compile(".*progress\.([0-9]+)\.log:LOG.*{0}.*\[(.*)\]".format(pattern)) + for line in progress_log_lines.split("\n") : + mat_obj = parse_regex.search(line) + if mat_obj is None: + continue + groups = mat_obj.groups() + iteration = groups[0] + differences = ParseDifferenceString(groups[1]) + component_names = component_names.union(differences.keys()) + progress_per_iter[int(iteration)] = differences + + component_names = list(component_names) + component_names.sort() + # rearranging the data into an array + data = [] + data.append(["iteration"]+component_names) + max_iter = max(progress_per_iter.keys()) + for iter in range(max_iter + 1): + try: + component_dict = progress_per_iter[iter] + except KeyError: + continue + iter_values = [] + for component_name in component_names: + try: + iter_values.append(component_dict[component_name]) + except KeyError: + # the component was not found this iteration, may be because of layerwise discriminative training + iter_values.append(0) + data.append([iter] + iter_values) + + return data + +def ParseTrainLogs(exp_dir): + train_log_files = "%s/log/train.*.log" % (exp_dir) + train_log_lines = ntl.RunKaldiCommand('grep -e Accounting {0}'.format(train_log_files), measure_time = False)[0] + parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# Accounting: time=([0-9]+) thread.*") + + train_times = {} + for line in train_log_lines.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + try: + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + except KeyError: + train_times[int(groups[0])] = {} + train_times[int(groups[0])][int(groups[1])] = float(groups[2]) + iters = train_times.keys() + for iter in iters: + values = train_times[iter].values() + train_times[iter] = max(values) + return train_times + +def ParseProbLogs(exp_dir, key = 'accuracy'): + train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) + valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) + train_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, train_prob_files), wait = True, measure_time = False)[0] + valid_prob_strings = ntl.RunKaldiCommand('grep -e {0} {1}'.format(key, valid_prob_files), measure_time = False)[0] + + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832 per frame, over 20000 fra + #LOG (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144) Overall log-probability for 'output' is -0.307255 per frame, over 20000 frames. + parse_regex = re.compile(".*compute_prob_.*\.([0-9]+).log:LOG .nnet3.*compute-prob:PrintTotalStats..:nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for 'output'.*is ([0-9.\-e]+) .*per frame") + train_loss={} + valid_loss={} + + + for line in train_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + train_loss[int(groups[0])] = groups[2] + for line in valid_prob_strings.split('\n'): + mat_obj = parse_regex.search(line) + if mat_obj is not None: + groups = mat_obj.groups() + if groups[1] == key: + valid_loss[int(groups[0])] = groups[2] + iters = list(set(valid_loss.keys()).intersection(train_loss.keys())) + iters.sort() + return map(lambda x: (int(x), float(train_loss[x]), float(valid_loss[x])), iters) + +def GenerateAccuracyReport(exp_dir, key = "accuracy"): + times = ParseTrainLogs(exp_dir) + data = ParseProbLogs(exp_dir, key) + report = [] + report.append("%Iter\tduration\ttrain_loss\tvalid_loss\tdifference") + for x in data: + try: + report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) + except KeyError: + continue + + total_time = 0 + for iter in times.keys(): + total_time += times[iter] + report.append("Total training time is {0}\n".format(str(datetime.timedelta(seconds = total_time)))) + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py new file mode 100755 index 00000000000..5290a4c1abe --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/make_configs.py @@ -0,0 +1,638 @@ +#!/usr/bin/env python + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import os +import argparse +import shlex +import sys +import warnings +import copy +import imp +import ast + +nodes = imp.load_source('', 'steps/nnet3/components.py') +nnet3_train_lib = imp.load_source('ntl', 'steps/nnet3/nnet3_train_lib.py') +chain_lib = imp.load_source('ncl', 'steps/nnet3/chain/nnet3_chain_lib.py') + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description="Writes config files and variables " + "for TDNNs creation and training", + epilog="See steps/nnet3/tdnn/train.sh for example.") + + # Only one of these arguments can be specified, and one of them has to + # be compulsarily specified + feat_group = parser.add_mutually_exclusive_group(required = True) + feat_group.add_argument("--feat-dim", type=int, + help="Raw feature dimension, e.g. 13") + feat_group.add_argument("--feat-dir", type=str, + help="Feature directory, from which we derive the feat-dim") + + # only one of these arguments can be specified + ivector_group = parser.add_mutually_exclusive_group(required = False) + ivector_group.add_argument("--ivector-dim", type=int, + help="iVector dimension, e.g. 100", default=0) + ivector_group.add_argument("--ivector-dir", type=str, + help="iVector dir, which will be used to derive the ivector-dim ", default=None) + + num_target_group = parser.add_mutually_exclusive_group(required = True) + num_target_group.add_argument("--num-targets", type=int, + help="number of network targets (e.g. num-pdf-ids/num-leaves)") + num_target_group.add_argument("--ali-dir", type=str, + help="alignment directory, from which we derive the num-targets") + num_target_group.add_argument("--tree-dir", type=str, + help="directory with final.mdl, from which we derive the num-targets") + + # CNN options + parser.add_argument('--cnn.layer', type=str, action='append', dest = "cnn_layer", + help="CNN parameters at each CNN layer, e.g. 
--filt-x-dim=3 --filt-y-dim=8 " + "--filt-x-step=1 --filt-y-step=1 --num-filters=256 --pool-x-size=1 --pool-y-size=3 " + "--pool-z-size=1 --pool-x-step=1 --pool-y-step=3 --pool-z-step=1, " + "when CNN layers are used, no LDA will be added", default = None) + parser.add_argument("--cnn.bottleneck-dim", type=int, dest = "cnn_bottleneck_dim", + help="Output dimension of the linear layer at the CNN output " + "for dimension reduction, e.g. 256." + "The default zero means this layer is not needed.", default=0) + parser.add_argument("--cnn.cepstral-lifter", type=float, dest = "cepstral_lifter", + help="The factor used for determining the liftering vector in the production of MFCC. " + "User has to ensure that it matches the lifter used in MFCC generation, " + "e.g. 22.0", default=22.0) + + # General neural network options + parser.add_argument("--splice-indexes", type=str, required = True, + help="Splice indexes at each layer, e.g. '-3,-2,-1,0,1,2,3' " + "If CNN layers are used the first set of splice indexes will be used as input " + "to the first CNN layer and later splice indexes will be interpreted as indexes " + "for the TDNNs.") + parser.add_argument("--add-lda", type=str, action=nnet3_train_lib.StrToBoolAction, + help="If \"true\" an LDA matrix computed from the input features " + "(spliced according to the first set of splice-indexes) will be used as " + "the first Affine layer. This affine layer's parameters are fixed during training. " + "If --cnn.layer is specified this option will be forced to \"false\".", + default=True, choices = ["false", "true"]) + + parser.add_argument("--include-log-softmax", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add the final softmax layer ", default=True, choices = ["false", "true"]) + parser.add_argument("--add-final-sigmoid", type=str, action=nnet3_train_lib.StrToBoolAction, + help="add a final sigmoid layer as alternate to log-softmax-layer. " + "Can only be used if include-log-softmax is false. " + "This is useful in cases where you want the output to be " + "like probabilities between 0 and 1. Typically the nnet " + "is trained with an objective such as quadratic", + default=False, choices = ["false", "true"]) + + parser.add_argument("--objective-type", type=str, + help = "the type of objective; i.e. 
quadratic or linear", + default="linear", choices = ["linear", "quadratic"]) + parser.add_argument("--xent-regularize", type=float, + help="For chain models, if nonzero, add a separate output for cross-entropy " + "regularization (with learning-rate-factor equal to the inverse of this)", + default=0.0) + parser.add_argument("--xent-separate-forward-affine", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if using --xent-regularize, gives it separate last-but-one weight matrix", + default=False, choices = ["false", "true"]) + parser.add_argument("--final-layer-normalize-target", type=float, + help="RMS target for final layer (set to <1 if final layer learns too fast", + default=1.0) + parser.add_argument("--subset-dim", type=int, default=0, + help="dimension of the subset of units to be sent to the central frame") + parser.add_argument("--pnorm-input-dim", type=int, + help="input dimension to p-norm nonlinearities") + parser.add_argument("--pnorm-output-dim", type=int, + help="output dimension of p-norm nonlinearities") + parser.add_argument("--relu-dim", type=int, + help="dimension of ReLU nonlinearities") + + parser.add_argument("--self-repair-scale", type=float, + help="A non-zero value activates the self-repair mechanism in the sigmoid and tanh non-linearities of the LSTM", default=None) + + + parser.add_argument("--pool-type", type=str, default = 'none', + help="Type of pooling to be used.", choices = ['low-pass', 'weighted-average', 'per-dim-weighted-average', 'multi-dim-weighted-average', 'none']) + parser.add_argument("--pool-window", type=int, default = None, + help="Width of the pooling window") + parser.add_argument("--pool-lpfilter-width", type=float, + default = None, help="Nyquist frequency of the lpfilter to be used for pooling") + parser.add_argument("--use-presoftmax-prior-scale", type=str, action=nnet3_train_lib.StrToBoolAction, + help="if true, a presoftmax-prior-scale is added", + choices=['true', 'false'], default = True) + parser.add_argument("config_dir", + help="Directory to write config files and variables") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if not os.path.exists(args.config_dir): + os.makedirs(args.config_dir) + + ## Check arguments. 
+ if args.feat_dir is not None: + args.feat_dim = nnet3_train_lib.GetFeatDim(args.feat_dir) + + if args.ali_dir is not None: + args.num_targets = nnet3_train_lib.GetNumberOfLeaves(args.ali_dir) + elif args.tree_dir is not None: + args.num_targets = chain_lib.GetNumberOfLeaves(args.tree_dir) + + if args.ivector_dir is not None: + args.ivector_dim = nnet3_train_lib.GetIvectorDim(args.ivector_dir) + + if not args.feat_dim > 0: + raise Exception("feat-dim has to be postive") + + if not args.num_targets > 0: + print(args.num_targets) + raise Exception("num_targets has to be positive") + + if not args.ivector_dim >= 0: + raise Exception("ivector-dim has to be non-negative") + + if (args.subset_dim < 0): + raise Exception("--subset-dim has to be non-negative") + if (args.pool_window is not None) and (args.pool_window <= 0): + raise Exception("--pool-window has to be positive") + + if not args.relu_dim is None: + if not args.pnorm_input_dim is None or not args.pnorm_output_dim is None: + raise Exception("--relu-dim argument not compatible with " + "--pnorm-input-dim or --pnorm-output-dim options"); + args.nonlin_input_dim = args.relu_dim + args.nonlin_output_dim = args.relu_dim + else: + if not args.pnorm_input_dim > 0 or not args.pnorm_output_dim > 0: + raise Exception("--relu-dim not set, so expected --pnorm-input-dim and " + "--pnorm-output-dim to be provided."); + args.nonlin_input_dim = args.pnorm_input_dim + args.nonlin_output_dim = args.pnorm_output_dim + + if args.add_final_sigmoid and args.include_log_softmax: + raise Exception("--include-log-softmax and --add-final-sigmoid cannot both be true.") + + if args.xent_separate_forward_affine and args.add_final_sigmoid: + raise Exception("It does not make sense to have --add-final-sigmoid=true when xent-separate-forward-affine is true") + + if args.add_lda and args.cnn_layer is not None: + args.add_lda = False + warnings.warn("--add-lda is set to false as CNN layers are used.") + + return args + + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddMultiDimAffineLayer(config_lines, name, input, input_window, 
block_input_dim, block_output_dim): + assert(block_input_dim % input_window== 0) + filter_context = int((input_window - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + + # add permute component to shuffle the feature columns of the Append + # descriptor output so that columns corresponding to the same feature index + # are contiguous add a block-affine component to collapse all the feature + # indexes across time steps into a single value + num_feats = input['dimension'] + num_times = len(filter_input_splice_indexes) + column_map = [] + for i in range(num_feats): + for j in range(num_times): + column_map.append(j * num_feats + i) + permuted_output_descriptor = nodes.AddPermuteLayer(config_lines, + name, filter_input_descriptor, column_map) + # add a block-affine component + output_descriptor = nodes.AddBlockAffineLayer(config_lines, name, + permuted_output_descriptor, + num_feats / (block_input_dim / input_window) * block_output_dim, num_feats / (block_input_dim/ input_window)) + + return [output_descriptor, filter_context, filter_context] + +def AddLpFilter(config_lines, name, input, rate, num_lpfilter_taps, lpfilt_filename, is_updatable = False): + try: + import scipy.signal as signal + import numpy as np + except ImportError: + raise Exception(" This recipe cannot be run without scipy." + " You can install it using the command \n" + " pip install scipy\n" + " If you do not have admin access on the machine you are" + " trying to run this recipe, you can try using" + " virtualenv") + # low-pass smoothing of input was specified. 
so we will add a low-pass filtering layer + lp_filter = signal.firwin(num_lpfilter_taps, rate, width=None, window='hamming', pass_zero=True, scale=True, nyq=1.0) + lp_filter = list(np.append(lp_filter, 0)) + nnet3_train_lib.WriteKaldiMatrix(lpfilt_filename, [lp_filter]) + filter_context = int((num_lpfilter_taps - 1) / 2) + filter_input_splice_indexes = range(-1 * filter_context, filter_context + 1) + list = [('Offset({0}, {1})'.format(input['descriptor'], n) if n != 0 else input['descriptor']) for n in filter_input_splice_indexes] + filter_input_descriptor = 'Append({0})'.format(' , '.join(list)) + filter_input_descriptor = {'descriptor':filter_input_descriptor, + 'dimension':len(filter_input_splice_indexes) * input['dimension']} + + input_x_dim = len(filter_input_splice_indexes) + input_y_dim = input['dimension'] + input_z_dim = 1 + filt_x_dim = len(filter_input_splice_indexes) + filt_y_dim = 1 + filt_x_step = 1 + filt_y_step = 1 + input_vectorization = 'zyx' + + tdnn_input_descriptor = nodes.AddConvolutionLayer(config_lines, name, + filter_input_descriptor, + input_x_dim, input_y_dim, input_z_dim, + filt_x_dim, filt_y_dim, + filt_x_step, filt_y_step, + 1, input_vectorization, + filter_bias_file = lpfilt_filename, + is_updatable = is_updatable) + + + return [tdnn_input_descriptor, filter_context, filter_context] + +def AddConvMaxpLayer(config_lines, name, input, args): + if '3d-dim' not in input: + raise Exception("The input to AddConvMaxpLayer() needs '3d-dim' parameters.") + + input = nodes.AddConvolutionLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.filt_x_dim, args.filt_y_dim, + args.filt_x_step, args.filt_y_step, + args.num_filters, input['vectorization']) + + if args.pool_x_size > 1 or args.pool_y_size > 1 or args.pool_z_size > 1: + input = nodes.AddMaxpoolingLayer(config_lines, name, input, + input['3d-dim'][0], input['3d-dim'][1], input['3d-dim'][2], + args.pool_x_size, args.pool_y_size, args.pool_z_size, + args.pool_x_step, args.pool_y_step, args.pool_z_step) + + return input + +# The ivectors are processed through an affine layer parallel to the CNN layers, +# then concatenated with the CNN output and passed to the deeper part of the network. 
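+# A rough illustration (made-up dimensions, not taken from any recipe): with a
+# CNN stack whose output dimension is 256 and ivector_dim=100, the layer
+# returned by AddCnnLayers() would look roughly like
+#     {'descriptor': 'Append(<cnn-or-bottleneck-output>, <ivector-affine-output>)',
+#      'dimension': 356}
+# i.e. the iVector stream bypasses the convolutional stack and is only joined
+# back in at this function's output.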
+def AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, feat_dim, splice_indexes=[0], ivector_dim=0): + cnn_args = ParseCnnString(cnn_layer) + num_cnn_layers = len(cnn_args) + # We use an Idct layer here to convert MFCC to FBANK features + nnet3_train_lib.WriteIdctMatrix(feat_dim, cepstral_lifter, config_dir.strip() + "/idct.mat") + prev_layer_output = {'descriptor': "input", + 'dimension': feat_dim} + prev_layer_output = nodes.AddFixedAffineLayer(config_lines, "Idct", prev_layer_output, config_dir.strip() + '/idct.mat') + + list = [('Offset({0}, {1})'.format(prev_layer_output['descriptor'],n) if n != 0 else prev_layer_output['descriptor']) for n in splice_indexes] + splice_descriptor = "Append({0})".format(", ".join(list)) + cnn_input_dim = len(splice_indexes) * feat_dim + prev_layer_output = {'descriptor': splice_descriptor, + 'dimension': cnn_input_dim, + '3d-dim': [len(splice_indexes), feat_dim, 1], + 'vectorization': 'yzx'} + + for cl in range(0, num_cnn_layers): + prev_layer_output = AddConvMaxpLayer(config_lines, "L{0}".format(cl), prev_layer_output, cnn_args[cl]) + + if cnn_bottleneck_dim > 0: + prev_layer_output = nodes.AddAffineLayer(config_lines, "cnn-bottleneck", prev_layer_output, cnn_bottleneck_dim, "") + + if ivector_dim > 0: + iv_layer_output = {'descriptor': 'ReplaceIndex(ivector, t, 0)', + 'dimension': ivector_dim} + iv_layer_output = nodes.AddAffineLayer(config_lines, "ivector", iv_layer_output, ivector_dim, "") + prev_layer_output['descriptor'] = 'Append({0}, {1})'.format(prev_layer_output['descriptor'], iv_layer_output['descriptor']) + prev_layer_output['dimension'] = prev_layer_output['dimension'] + iv_layer_output['dimension'] + + return prev_layer_output + +def PrintConfig(file_name, config_lines): + f = open(file_name, 'w') + f.write("\n".join(config_lines['components'])+"\n") + f.write("\n#Component nodes\n") + f.write("\n".join(config_lines['component-nodes'])) + f.close() + +def ParseCnnString(cnn_param_string_list): + cnn_parser = argparse.ArgumentParser(description="cnn argument parser") + + cnn_parser.add_argument("--filt-x-dim", required=True, type=int) + cnn_parser.add_argument("--filt-y-dim", required=True, type=int) + cnn_parser.add_argument("--filt-x-step", type=int, default = 1) + cnn_parser.add_argument("--filt-y-step", type=int, default = 1) + cnn_parser.add_argument("--num-filters", required=True, type=int) + cnn_parser.add_argument("--pool-x-size", type=int, default = 1) + cnn_parser.add_argument("--pool-y-size", type=int, default = 1) + cnn_parser.add_argument("--pool-z-size", type=int, default = 1) + cnn_parser.add_argument("--pool-x-step", type=int, default = 1) + cnn_parser.add_argument("--pool-y-step", type=int, default = 1) + cnn_parser.add_argument("--pool-z-step", type=int, default = 1) + + cnn_args = [] + for cl in range(0, len(cnn_param_string_list)): + cnn_args.append(cnn_parser.parse_args(shlex.split(cnn_param_string_list[cl]))) + + return cnn_args + +def ParseSpliceString(splice_indexes): + splice_array = [] + left_context = 0 + right_context = 0 + split1 = splice_indexes.split(); # we already checked the string is nonempty. 
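+    # Worked example (hypothetical input, for illustration only): for
+    # splice_indexes = "-4,-3,-2,-1,0,1,2,3,4 0 -2,2", split1 becomes
+    # ['-4,-3,-2,-1,0,1,2,3,4', '0', '-2,2'], giving num_hidden_layers = 3 with
+    # left_context = 4 + 0 + 2 = 6 and right_context = 4 + 0 + 2 = 6.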
+ if len(split1) < 1: + raise Exception("invalid splice-indexes argument, too short: " + + splice_indexes) + try: + for string in split1: + split2 = string.split(",") + if len(split2) < 1: + raise Exception("invalid splice-indexes argument, too-short element: " + + splice_indexes) + int_list = [] + for int_str in split2: + int_list.append(int(int_str)) + if not int_list == sorted(int_list): + raise Exception("elements of splice-indexes must be sorted: " + + splice_indexes) + left_context += -int_list[0] + right_context += int_list[-1] + splice_array.append(int_list) + except ValueError as e: + raise Exception("invalid splice-indexes argument " + splice_indexes + str(e)) + left_context = max(0, left_context) + right_context = max(0, right_context) + + return {'left_context':left_context, + 'right_context':right_context, + 'splice_indexes':splice_array, + 'num_hidden_layers':len(splice_array) + } + +# The function signature of MakeConfigs is changed frequently as it is intended for local use in this script. +def MakeConfigs(config_dir, splice_indexes_string, + cnn_layer, cnn_bottleneck_dim, cepstral_lifter, + feat_dim, ivector_dim, num_targets, add_lda, + nonlin_input_dim, nonlin_output_dim, subset_dim, + pool_type, pool_window, pool_lpfilter_width, + use_presoftmax_prior_scale, + final_layer_normalize_target, + include_log_softmax, + add_final_sigmoid, + xent_regularize, + xent_separate_forward_affine, + self_repair_scale, + objective_type): + + parsed_splice_output = ParseSpliceString(splice_indexes_string.strip()) + + left_context = parsed_splice_output['left_context'] + right_context = parsed_splice_output['right_context'] + num_hidden_layers = parsed_splice_output['num_hidden_layers'] + splice_indexes = parsed_splice_output['splice_indexes'] + input_dim = len(parsed_splice_output['splice_indexes'][0]) + feat_dim + ivector_dim + + if xent_separate_forward_affine: + if splice_indexes[-1] != [0]: + raise Exception("--xent-separate-forward-affine option is supported only if the last-hidden layer has no splicing before it. Please use a splice-indexes with just 0 as the final splicing config.") + + prior_scale_file = '{0}/presoftmax_prior_scale.vec'.format(config_dir) + + config_lines = {'components':[], 'component-nodes':[]} + + config_files={} + prev_layer_output = nodes.AddInputLayer(config_lines, feat_dim, splice_indexes[0], ivector_dim) + + # Add the init config lines for estimating the preconditioning matrices + init_config_lines = copy.deepcopy(config_lines) + init_config_lines['components'].insert(0, '# Config file for initializing neural network prior to') + init_config_lines['components'].insert(0, '# preconditioning matrix computation') + nodes.AddOutputLayer(init_config_lines, prev_layer_output) + config_files[config_dir + '/init.config'] = init_config_lines + + if cnn_layer is not None: + prev_layer_output = AddCnnLayers(config_lines, cnn_layer, cnn_bottleneck_dim, cepstral_lifter, config_dir, + feat_dim, splice_indexes[0], ivector_dim) + + if add_lda: + prev_layer_output = nodes.AddLdaLayer(config_lines, "L0", prev_layer_output, config_dir + '/lda.mat') + + left_context = 0 + right_context = 0 + # we moved the first splice layer to before the LDA.. 
+ # so the input to the first affine layer is going to [0] index + splice_indexes[0] = [0] + + for i in range(0, num_hidden_layers): + # make the intermediate config file for layerwise discriminative training + # if specified, pool the input from the previous layer + + # prepare the spliced input + if not (len(splice_indexes[i]) == 1 and splice_indexes[i][0] == 0): + if pool_type != "none" and pool_window is None: + raise Exception("Pooling type was specified as {0}, this requires specification of the pool-window".format(pool_type)) + if pool_type in set(["low-pass", "weighted-average"]): + if pool_type == "weighted-average": + lpfilter_is_updatable = True + else: + lpfilter_is_updatable = False + # low-pass filter the input to smooth it before the sub-sampling + [prev_layer_output, cur_left_context, cur_right_context] = AddLpFilter(config_lines, + 'Tdnn_input_smoother_{0}'.format(i), + prev_layer_output, + pool_lpfilter_width, + pool_window, + config_dir + '/Tdnn_input_smoother_{0}.txt'.format(i), + is_updatable = lpfilter_is_updatable) + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "per-dim-weighted-average": + # add permute component to shuffle the feature columns of the Append descriptor output so + # that columns corresponding to the same feature index are contiguous + # add a block-affine component to collapse all the feature indexes across time steps into a single value + [prev_layer_output, cur_left_context, cur_right_context] = nodes.AddPerDimAffineLayer(config_lines, + 'Tdnn_input_{0}'.format(i), + prev_layer_output, + pool_window) + + left_context += cur_left_context + right_context += cur_right_context + + elif pool_type == "multi-dim-weighted-average": + [prev_layer_output, cur_left_context, cur_right_context] = AddMultiDimAffineLayer(config_lines, + 'Tdnn_input_{0}'.format(i), + prev_layer_output, + pool_window, + 8 * pool_window, 8) + left_context += cur_left_context + right_context += cur_right_context + + + try: + zero_index = splice_indexes[i].index(0) + except ValueError: + zero_index = None + # I just assume the prev_layer_output_descriptor is a simple forwarding descriptor + prev_layer_output_descriptor = prev_layer_output['descriptor'] + subset_output = prev_layer_output + if subset_dim > 0: + # if subset_dim is specified the script expects a zero in the splice indexes + assert(zero_index is not None) + subset_node_config = "dim-range-node name=Tdnn_input_{0} input-node={1} dim-offset={2} dim={3}".format(i, prev_layer_output_descriptor, 0, subset_dim) + subset_output = {'descriptor' : 'Tdnn_input_{0}'.format(i), + 'dimension' : subset_dim} + config_lines['component-nodes'].append(subset_node_config) + appended_descriptors = [] + appended_dimension = 0 + for j in range(len(splice_indexes[i])): + if j == zero_index: + appended_descriptors.append(prev_layer_output['descriptor']) + appended_dimension += prev_layer_output['dimension'] + continue + appended_descriptors.append('Offset({0}, {1})'.format(subset_output['descriptor'], splice_indexes[i][j])) + appended_dimension += subset_output['dimension'] + prev_layer_output = {'descriptor' : "Append({0})".format(" , ".join(appended_descriptors)), + 'dimension' : appended_dimension} + else: + # this is a normal affine node + pass + + if xent_separate_forward_affine and i == num_hidden_layers - 1: + if xent_regularize == 0.0: + raise Exception("xent-separate-forward-affine=True is valid only if xent-regularize is non-zero") + + prev_layer_output_chain = 
nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_chain", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + + nodes.AddFinalLayer(config_lines, prev_layer_output_chain, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax) + + + prev_layer_output_xent = nodes.AddAffRelNormLayer(config_lines, "Tdnn_pre_final_xent", + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = final_layer_normalize_target) + + nodes.AddFinalLayer(config_lines, prev_layer_output_xent, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + else: + prev_layer_output = nodes.AddAffRelNormLayer(config_lines, "Tdnn_{0}".format(i), + prev_layer_output, nonlin_output_dim, + self_repair_scale = self_repair_scale, + norm_target_rms = 1.0 if i < num_hidden_layers -1 else final_layer_normalize_target) + + # a final layer is added after each new layer as we are generating + # configs for layer-wise discriminative training + + # add_final_sigmoid adds a sigmoid as a final layer as alternative + # to log-softmax layer. + # http://ufldl.stanford.edu/wiki/index.php/Softmax_Regression#Softmax_Regression_vs._k_Binary_Classifiers + # This is useful when you need the final outputs to be probabilities between 0 and 1. + # Usually used with an objective-type such as "quadratic". + # Applications are k-binary classification such Ideal Ratio Mask prediction. 
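+            # For instance (hypothetical flags, not a tested configuration):
+            #   --include-log-softmax false --add-final-sigmoid true --objective-type quadratic
+            # would make the network regress per-dimension targets in [0, 1],
+            # such as ideal-ratio-mask values, instead of producing a
+            # distribution over senones.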
+ nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = include_log_softmax, + add_final_sigmoid = add_final_sigmoid, + objective_type = objective_type) + if xent_regularize != 0.0: + nodes.AddFinalLayer(config_lines, prev_layer_output, num_targets, + ng_affine_options = " param-stddev=0 bias-stddev=0 learning-rate-factor={0} ".format( + 0.5 / xent_regularize), + use_presoftmax_prior_scale = use_presoftmax_prior_scale, + prior_scale_file = prior_scale_file, + include_log_softmax = True, + name_affix = 'xent') + + config_files['{0}/layer{1}.config'.format(config_dir, i+1)] = config_lines + config_lines = {'components':[], 'component-nodes':[]} + + left_context += int(parsed_splice_output['left_context']) + right_context += int(parsed_splice_output['right_context']) + + # write the files used by other scripts like steps/nnet3/get_egs.sh + f = open(config_dir + "/vars", "w") + print('model_left_context=' + str(left_context), file=f) + print('model_right_context=' + str(right_context), file=f) + print('num_hidden_layers=' + str(num_hidden_layers), file=f) + print('num_targets=' + str(num_targets), file=f) + print('add_lda=' + ('true' if add_lda else 'false'), file=f) + print('include_log_softmax=' + ('true' if include_log_softmax else 'false'), file=f) + print('objective_type=' + objective_type, file=f) + f.close() + + # printing out the configs + # init.config used to train lda-mllt train + for key in config_files.keys(): + PrintConfig(key, config_files[key]) + +def Main(): + args = GetArgs() + + MakeConfigs(config_dir = args.config_dir, + splice_indexes_string = args.splice_indexes, + feat_dim = args.feat_dim, ivector_dim = args.ivector_dim, + num_targets = args.num_targets, + add_lda = args.add_lda, + cnn_layer = args.cnn_layer, + cnn_bottleneck_dim = args.cnn_bottleneck_dim, + cepstral_lifter = args.cepstral_lifter, + nonlin_input_dim = args.nonlin_input_dim, + nonlin_output_dim = args.nonlin_output_dim, + subset_dim = args.subset_dim, + pool_type = args.pool_type, pool_window = args.pool_window, + pool_lpfilter_width = args.pool_lpfilter_width, + use_presoftmax_prior_scale = args.use_presoftmax_prior_scale, + final_layer_normalize_target = args.final_layer_normalize_target, + include_log_softmax = args.include_log_softmax, + add_final_sigmoid = args.add_final_sigmoid, + xent_regularize = args.xent_regularize, + xent_separate_forward_affine = args.xent_separate_forward_affine, + self_repair_scale = args.self_repair_scale, + objective_type = args.objective_type) + +if __name__ == "__main__": + Main() + diff --git a/egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh b/egs/wsj/s5/steps/nnet3/tdnn/train.sh similarity index 54% rename from egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh rename to egs/wsj/s5/steps/nnet3/tdnn/train.sh index 7e5990bc5e5..e21f5403737 100755 --- a/egs/wsj/s5/steps/nnet2/train_multisplice_accel2_fix.sh +++ b/egs/wsj/s5/steps/nnet3/tdnn/train.sh @@ -1,18 +1,14 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar # 2014 Vijayaditya Peddinti # Apache 2.0. -# train_multisplice_accel2.sh is a modified version of -# train_pnorm_multisplice2.sh (still using pnorm). 
The "accel" refers to the -# fact that we increase the number of jobs during training (from -# --num-jobs-initial to --num-jobs-final). We dropped "pnorm" from the name as -# it was getting too long. - # Begin configuration section. cmd=run.pl @@ -20,26 +16,24 @@ num_epochs=15 # Number of epochs of training; # the number of iterations is worked out from this. initial_effective_lrate=0.01 final_effective_lrate=0.001 -bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 -minibatch_size=128 # by default use a smallish minibatch size for neural net - # training; this controls instability which would otherwise - # be a problem with multi-threaded update. - +relu_dim= # you can use this to make it use ReLU's instead of p-norms. +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=10000 # 10k samples per job, for computing priors. Should be - # more than enough. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. -get_egs_stage=0 -fix_nnet=true -min_average=0.05 -max_average=0.95 +get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= -remove_egs=false # set to false to disable removing egs. +presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true +remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give # to the final 'combine' stage, but these models will themselves be averages of @@ -55,51 +49,37 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of # affect each others' gradients. add_layers_period=2 # by default, add new layers every 2 iterations. -num_hidden_layers=3 -stage=-4 +stage=-6 exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage -splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" +# count space-separated fields in splice_indexes to get num-hidden-layers. +splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" # Format : layer/....layer/ " # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. -alpha=4.0 # relates to preconditioning. -update_period=4 # relates to online preconditioning: says how often we update the subspace. -num_samples_history=2000 # relates to online preconditioning -max_change_per_sample=0.075 -precondition_rank_in=20 # relates to online preconditioning -precondition_rank_out=80 # relates to online preconditioning - -mix_up=0 # Number of components to mix up to (should be > #tree leaves, if - # specified.) 
-num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" - # by default we use 16 threads; this lets the queue know. - # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. -combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +use_gpu=true # if true, we run on GPU. cleanup=true egs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. lda_opts= -lda_dim= egs_opts= -transform_dir= # If supplied, overrides alidir -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. -feat_type= # Can be used to force "raw" features. +feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. num_jobs_align=30 # Number of jobs for realignment # End configuration section. -frames_per_eg=8 # to be passed on to get_egs2.sh +frames_per_eg=8 # to be passed on to get_egs.sh +subset_dim=0 trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM @@ -121,17 +101,16 @@ if [ $# != 4 ]; then echo " # data, 0.00025 for large data" echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" - echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer," - echo " # per context-dependent state. Try a number several times #states." + echo " --presoftmax-prior-scale-power # use the specified power value on the priors (inverse priors) to scale" + echo " # the pre-softmax outputs (set to 0.0 to disable the presoftmax element scale)" echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" - echo " --num-threads # Number of parallel threads per job (will affect results" - echo " # as well as speed; may interact with batch size; if you increase" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." 
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -141,15 +120,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -183,62 +162,116 @@ mkdir -p $dir/log echo $nj > $dir/num_jobs cp $alidir/tree $dir -# process the splice_inds string, to get a layer-wise context string -# to be processed by the nnet-components -# this would be mainly used by SpliceComponent|SpliceMaxComponent -python steps/nnet2/make_multisplice_configs.py contexts --splice-indexes "$splice_indexes" $dir || exit -1; -context_string=$(cat $dir/vars) || exit -1 -echo $context_string -eval $context_string || exit -1; # - # initializes variables used by get_lda.sh and get_egs.sh - # get_lda.sh : first_left_context, first_right_context, - # get_egs.sh : nnet_left_context & nnet_right_context - -extra_opts=() -[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") -[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) -[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) -[ -z "$transform_dir" ] && transform_dir=$alidir -extra_opts+=(--transform-dir $transform_dir) -if [ $stage -le -4 ]; then - echo "$0: calling get_lda.sh" - steps/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; +# First work out the feature and iVector dimension, needed for tdnn config creation. +case $feat_type in + raw) feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + ;; + lda) [ ! -f $alidir/final.mat ] && echo "$0: With --feat-type lda option, expect $alidir/final.mat to exist." + # get num-rows in lda matrix, which is the lda feature dim. + feat_dim=$(matrix-dim --print-args=false $alidir/final.mat | cut -f 1) + ;; + *) + echo "$0: Bad --feat-type '$feat_type';"; exit 1; +esac +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + + +if [ $stage -le -5 ]; then + echo "$0: creating neural net configs"; + + if [ ! 
-z "$relu_dim" ]; then + dim_opts="--relu-dim $relu_dim" + else + dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" + fi + + # create the config files for nnet initialization + python steps/nnet3/tdnn/make_configs.py \ + --splice-indexes "$splice_indexes" \ + --subset-dim "$subset_dim" \ + --feat-dim $feat_dim \ + --ivector-dim $ivector_dim \ + $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ + --num-targets $num_leaves \ + $dir/configs || exit 1; + + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; fi -# these files will have been written by get_lda.sh -feat_dim=$(cat $dir/feat_dim) || exit 1; -ivector_dim=$(cat $dir/ivector_dim) || exit 1; -lda_dim=$(cat $dir/lda_dim) || exit 1; -if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then +# sourcing the "vars" below sets +# left_context=(something) +# right_context=(something) +# num_hidden_layers=(something) +. $dir/configs/vars || exit 1; + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +[ -z "$transform_dir" ] && transform_dir=$alidir + - extra_opts+=(--left-context $nnet_left_context ) - extra_opts+=(--right-context $nnet_right_context ) - echo "$0: calling get_egs2.sh" - steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \ +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir $transform_dir) + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ --frames-per-eg $frames_per_eg \ $data $alidir $dir/egs || exit 1; fi -if [ -z $egs_dir ]; then - egs_dir=$dir/egs - # confirm that the provided egs_dir has the necessary context - egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 - egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 - echo $egs_left_context $nnet_left_context $egs_right_context $nnet_right_context - ([[ $egs_left_context -lt $nnet_left_context ]] || [[ $egs_right_context -lt $nnet_right_context ]]) && - echo "Provided egs_dir $egs_dir does not have sufficient context to train the neural network." && exit -1; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; fi +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). 
+egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } # num_archives_expanded considers each separate label-position from # 0..frames_per_eg-1 to be a separate archive. -num_archives_expanded=$[$num_archives*$frames_per_eg] +if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi [ $num_jobs_initial -gt $num_jobs_final ] && \ echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; @@ -246,49 +279,67 @@ num_archives_expanded=$[$num_archives*$frames_per_eg] [ $num_jobs_final -gt $num_archives_expanded ] && \ echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; -if ! [ $num_hidden_layers -ge 1 ]; then - echo "Invalid num-hidden-layers $num_hidden_layers" - exit 1 -fi -if [ $stage -le -2 ]; then - echo "$0: initializing neural net"; - lda_mat=$dir/lda.mat - tot_input_dim=$[$feat_dim+$ivector_dim] +if [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs - online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; - initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; - # create the config files for nnet initialization - python steps/nnet2/make_multisplice_configs.py \ - --splice-indexes "$splice_indexes" \ - --total-input-dim $tot_input_dim \ - --ivector-dim $ivector_dim \ - --lda-mat "$lda_mat" \ - --lda-dim $lda_dim \ - --pnorm-input-dim $pnorm_input_dim \ - --pnorm-output-dim $pnorm_output_dim \ - --online-preconditioning-opts "$online_preconditioning_opts" \ - --initial-learning-rate $initial_lrate \ - --bias-stddev $bias_stddev \ - --num-hidden-layers $num_hidden_layers \ - --num-targets $num_leaves \ - configs $dir || exit -1; + rm $all_lda_accs || exit 1; - $cmd $dir/log/nnet_init.log \ - nnet-am-init $alidir/tree $lang/topo "nnet-init $dir/nnet.config -|" \ - $dir/0.mdl || exit 1; + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. 
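+  # (Since no --dim option is passed to nnet-get-feature-transform below, the
+  # resulting lda.mat keeps the full spliced-input dimension: it decorrelates
+  # and rescales the features rather than reducing their dimensionality.)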
+ $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -2 ]; then + echo "$0: preparing initial vector for FixedScaleComponent before softmax" + echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" + + # obtains raw pdf count + $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ + ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ + post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; + $cmd $dir/log/sum_pdf_counts.log \ + vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; + rm $dir/pdf_counts.* + + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ + '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } + num_pdfs=NF-2; average_count = total/num_pdfs; + for (i=0; i $dir/presoftmax_prior_scale.vec + ln -sf ../presoftmax_prior_scale.vec $dir/configs/presoftmax_prior_scale.vec fi -#if [ $pnorm_input_dim -eq $pnorm_output_dim ]; then fix_nnet=true;fi if [ $stage -le -1 ]; then - echo "Training transition probabilities and setting priors" - $cmd $dir/log/train_trans.log \ - nnet-train-transitions $dir/0.mdl "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl \ - || exit 1; + # Add the first layer; this will add in the lda.mat and + # presoftmax_prior_scale.vec. + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + + # Convert to .mdl, train the transitions, set the priors. + $cmd $dir/log/init_mdl.log \ + nnet3-am-init $alidir/final.mdl $dir/0.raw - \| \ + nnet3-am-train-transitions - "ark:gunzip -c $alidir/ali.*.gz|" $dir/0.mdl || exit 1; fi + # set num_iters so that as close as possible, we process the data $num_epochs # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. @@ -302,59 +353,50 @@ num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] - -# mix up at the iteration where we've processed about half the data; this keeps -# the overall training procedure fairly invariant to the number of initial and -# final jobs. -# j = initial, k = final, n = num-iters, x = half-of-data epoch, -# p is proportion of data we want to process (e.g. p=0.5 here). -# solve for x if the amount of data processed by epoch x is p -# times the amount by iteration n. -# put this in wolfram alpha: -# solve { x*j + (k-j)*x*x/(2*n) = p * (j*n + (k-j)*n/2), {x} } -# got: x = (j n-sqrt(-n^2 (j^2 (p-1)-k^2 p)))/(j-k) and j!=k and n!=0 -# simplified manually to: n * (sqrt(((1-p)j^2 + p k^2)/2) - j)/(j-k) -mix_up_iter=$(perl -e '($j,$k,$n,$p)=@ARGV; print int(0.5 + ($j==$k ? $n*$p : $n*(sqrt((1-$p)*$j*$j+$p*$k*$k)-$j)/($k-$j))); ' $num_jobs_initial $num_jobs_final $num_iters 0.5) -! [ $mix_up_iter -gt $finish_add_layers_iter ] && \ - echo "Mix-up-iter is $mix_up_iter, should be greater than $finish_add_layers_iter -> add more epochs?" \ - && exit 1; - echo "$0: Will train for $num_epochs epochs = $num_iters iterations" -[ $mix_up -gt 0 ] && echo "$0: Will mix up on iteration $mix_up_iter" -if [ $num_threads -eq 1 ]; then - parallel_suffix="-simple" # this enables us to use GPU code if - # we have just one thread. 
+if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" parallel_train_opts= if ! cuda-compiled; then echo "$0: WARNING: you are running with one thread but you have not compiled" echo " for CUDA. You may be running a setup optimized for GPUs. If you have" echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 fi else - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" fi approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] -# First work out how many models we want to combine over in the final -# nnet-combine-fast invocation. This equals +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). The number we use is: # min(max(max_models_combine, approx_iters_per_epoch_final), -# 2/3 * iters_after_mixup) -num_models_combine=$max_models_combine -if [ $num_models_combine -lt $approx_iters_per_epoch_final ]; then - num_models_combine=$approx_iters_per_epoch_final +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final fi -iters_after_mixup_23=$[(($num_iters-$mix_up_iter-1)*2)/3] -if [ $num_models_combine -gt $iters_after_mixup_23 ]; then - num_models_combine=$iters_after_mixup_23 +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers fi -first_model_combine=$[$num_iters-$num_models_combine+1] +first_model_combine=$[$num_iters-$num_iters_combine+1] x=0 - for realign_time in $realign_times; do # Work out the iterations on which we will re-align, if the --realign-times # option was used. This is slightly approximate. @@ -369,13 +411,13 @@ cur_egs_dir=$egs_dir while [ $x -lt $num_iters ]; do [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; - if [ $x -gt $[$num_iters/2] ]; then fix_nnet=false; fi + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -392,9 +434,10 @@ while [ $x -lt $num_iters ]; do # we're using different random subsets of it. 
rm $dir/post.$x.*.vec 2>/dev/null $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ - nnet-copy-egs --srand=JOB --frame=random ark:$prev_egs_dir/egs.1.ark ark:- \| \ - nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet-compute-from-egs "nnet-to-raw-nnet $dir/$x.mdl -|" ark:- ark:- \| \ + nnet3-copy-egs --srand=JOB --frame=random $context_opts ark:$prev_egs_dir/egs.1.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs --apply-exp=true "nnet3-am-copy --raw=true $dir/$x.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. @@ -405,33 +448,36 @@ while [ $x -lt $num_iters ]; do echo "Re-adjusting priors based on computed posteriors" $cmd $dir/log/adjust_priors.$x.log \ - nnet-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; + nnet3-am-adjust-priors $dir/$x.mdl $dir/post.$x.vec $dir/$x.mdl || exit 1; sleep 2 - steps/nnet2/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ + steps/nnet3/align.sh --nj $num_jobs_align --cmd "$align_cmd" --use-gpu $align_use_gpu \ --transform-dir "$transform_dir" --online-ivector-dir "$online_ivector_dir" \ --iter $x $data $lang $dir $dir/ali_$time || exit 1 - steps/nnet2/relabel_egs2.sh --cmd "$cmd" --iter $x $dir/ali_$time \ + steps/nnet3/relabel_egs.sh --cmd "$cmd" --iter $x $dir/ali_$time \ $prev_egs_dir $cur_egs_dir || exit 1 if $cleanup && [[ $prev_egs_dir =~ $dir/egs* ]]; then - steps/nnet2/remove_egs.sh $prev_egs_dir + steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ - nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/valid_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.$x.log \ - nnet-compute-prob $dir/$x.mdl ark:$cur_egs_dir/train_diagnostic.egs & - if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ - nnet-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - nnet-am-info $dir/$x.mdl & + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & fi echo "Training neural net (pass $x)" @@ -439,22 +485,24 @@ while [ $x -lt $num_iters ]; do if [ $x -gt 0 ] && \ [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ [ $[$x%$add_layers_period] -eq 0 ]; then - do_average=false # if we've just mixed up, don't do averaging take the best. - cur_num_hidden_layers=$[$x/$add_layers_period]; - mdl="nnet-init --srand=$x $dir/hidden_${cur_num_hidden_layers}.config - | nnet-insert $dir/$x.mdl - - | nnet-am-copy --learning-rate=$this_learning_rate - -|" + do_average=false # if we've just mixed up, don't do averaging but take the + # best. 
+ cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl - | nnet3-init --srand=$x - $config - |" else do_average=true if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. - mdl="nnet-am-copy --learning-rate=$this_learning_rate $dir/$x.mdl -|" + raw="nnet3-am-copy --raw=true --learning-rate=$this_learning_rate $dir/$x.mdl -|" fi if $do_average; then this_minibatch_size=$minibatch_size else # on iteration zero or when we just added a layer, use a smaller minibatch - # size and just one job: the model-averaging doesn't seem to be helpful - # when the model is changing too fast (i.e. it worsens the objective - # function), and the smaller minibatch size will help to keep - # the update stable. + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. this_minibatch_size=$[$minibatch_size/2]; fi @@ -464,7 +512,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -477,11 +525,11 @@ while [ $x -lt $num_iters ]; do # same archive with different frame indexes will give similar gradients, # so we want to separate them in time. - $cmd $parallel_opts $dir/log/train.$x.$n.log \ - nnet-train$parallel_suffix $parallel_train_opts \ - --minibatch-size=$this_minibatch_size --srand=$x "$mdl" \ - "ark:nnet-copy-egs --frame=$frame ark:$cur_egs_dir/egs.$archive.ark ark:-|nnet-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-|" \ - $dir/$[$x+1].$n.mdl || touch $dir/.error & + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & done wait ) @@ -491,36 +539,26 @@ while [ $x -lt $num_iters ]; do nnets_list= for n in `seq 1 $this_num_jobs`; do - nnets_list="$nnets_list $dir/$[$x+1].$n.mdl" + nnets_list="$nnets_list $dir/$[$x+1].$n.raw" done if $do_average; then # average the output of the different jobs. $cmd $dir/log/average.$x.log \ - nnet-am-average $nnets_list $dir/$[$x+1].mdl || exit 1; + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; else # choose the best from the different jobs. 
n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; - cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; - fi - - if $fix_nnet; then - # do nnet-am-fix to fix some pathology in the network - nnet-am-fix --max-average-deriv=$max_average --min-average-deriv=$min_average $dir/$[$x+1].mdl $dir/$[$x+1].mdl 2>$dir/log/fix.$x.log || exit; + $cmd $dir/log/select.$x.log \ + nnet3-am-copy --set-raw-nnet=$dir/$[$x+1].$n.raw $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; fi - if [ "$mix_up" -gt 0 ] && [ $x -eq $mix_up_iter ]; then - # mix up. - echo Mixing up from $num_leaves to $mix_up components - $cmd $dir/log/mix_up.$x.log \ - nnet-am-mixup --min-count=10 --num-mixtures=$mix_up \ - $dir/$[$x+1].mdl $dir/$[$x+1].mdl || exit 1; - fi rm $nnets_list [ ! -f $dir/$[$x+1].mdl ] && exit 1; if [ -f $dir/$[$x-1].mdl ] && $cleanup && \ @@ -536,77 +574,51 @@ done if [ $stage -le $num_iters ]; then echo "Doing final combination to produce final.mdl" - # Now do combination. + # Now do combination. In the nnet3 setup, the logic + # for doing averaging of subsets of the models in the case where + # there are too many models to reliably esetimate interpolation + # factors (max_models_combine) is moved into the nnet3-combine nnets_list=() - # the if..else..fi statement below sets 'nnets_list'. - if [ $max_models_combine -lt $num_models_combine ]; then - # The number of models to combine is too large, e.g. > 20. In this case, - # each argument to nnet-combine-fast will be an average of multiple models. - cur_offset=0 # current offset from first_model_combine. - for n in $(seq $max_models_combine); do - next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" - for o in $(seq $cur_offset $[$next_offset-1]); do - iter=$[$first_model_combine+$o] - mdl=$dir/$iter.mdl - [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; - sub_list="$sub_list $mdl" - done - nnets_list[$[$n-1]]="nnet-am-average $sub_list - |" - cur_offset=$next_offset - done - else - nnets_list= - for n in $(seq 0 $[num_models_combine-1]); do - iter=$[$first_model_combine+$n] - mdl=$dir/$iter.mdl - [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; - nnets_list[$n]=$mdl - done - fi + for n in $(seq 0 $[num_iters_combine-1]); do + iter=$[$first_model_combine+$n] + mdl=$dir/$iter.mdl + [ ! -f $mdl ] && echo "Expected $mdl to exist" && exit 1; + nnets_list[$n]="nnet3-am-copy --raw=true $mdl -|"; + done + # Below, we use --use-gpu=no to disable nnet3-combine-fast from using a GPU, + # as if there are many models it can give out-of-memory error; and we set + # num-threads to 8 to speed it up (this isn't ideal...) - # Below, use --use-gpu=no to disable nnet-combine-fast from using a GPU, as - # if there are many models it can give out-of-memory error; set num-threads to 8 - # to speed it up (this isn't ideal...) 
- num_egs=`nnet-copy-egs ark:$cur_egs_dir/combine.egs ark:/dev/null 2>&1 | tail -n 1 | awk '{print $NF}'` - mb=$[($num_egs+$combine_num_threads-1)/$combine_num_threads] - [ $mb -gt 512 ] && mb=512 - # Setting --initial-model to a large value makes it initialize the combination - # with the average of all the models. It's important not to start with a - # single model, or, due to the invariance to scaling that these nonlinearities - # give us, we get zero diagonal entries in the fisher matrix that - # nnet-combine-fast uses for scaling, which after flooring and inversion, has - # the effect that the initial model chosen gets much higher learning rates - # than the others. This prevents the optimization from working well. - $cmd $combine_parallel_opts $dir/log/combine.log \ - nnet-combine-fast --initial-model=100000 --num-lbfgs-iters=40 --use-gpu=no \ - --num-threads=$combine_num_threads \ - --verbose=3 --minibatch-size=$mb "${nnets_list[@]}" ark:$cur_egs_dir/combine.egs \ - $dir/final.mdl || exit 1; - - # Normalize stddev for affine or block affine layers that are followed by a - # pnorm layer and then a normalize layer. - $cmd $dir/log/normalize.log \ - nnet-normalize-stddev $dir/final.mdl $dir/final.mdl || exit 1; + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$cur_egs_dir/combine.egs ark:-|" \ + "|nnet3-am-copy --set-raw-nnet=- $dir/$num_iters.mdl $dir/combined.mdl" || exit 1; # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. $cmd $dir/log/compute_prob_valid.final.log \ - nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/valid_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/valid_diagnostic.egs ark:- |" & $cmd $dir/log/compute_prob_train.final.log \ - nnet-compute-prob $dir/final.mdl ark:$cur_egs_dir/train_diagnostic.egs & + nnet3-compute-prob "nnet3-am-copy --raw=true $dir/combined.mdl -|" \ + "ark,bg:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi rm $dir/post.$x.*.vec 2>/dev/null - $cmd JOB=1:$num_jobs_compute_prior $dir/log/get_post.$x.JOB.log \ - nnet-copy-egs --frame=random --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ - nnet-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ - nnet-compute-from-egs "nnet-to-raw-nnet $dir/final.mdl -|" ark:- ark:- \| \ + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + "nnet3-am-copy --raw=true $dir/combined.mdl -|" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. 
@@ -618,7 +630,7 @@ if [ $stage -le $[$num_iters+1] ]; then echo "Re-adjusting priors based on computed posteriors" $cmd $dir/log/adjust_priors.final.log \ - nnet-adjust-priors $dir/final.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; + nnet3-am-adjust-priors $dir/combined.mdl $dir/post.$x.vec $dir/final.mdl || exit 1; fi @@ -646,4 +658,3 @@ if $cleanup; then fi done fi - diff --git a/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh new file mode 100755 index 00000000000..6fe772f7e0d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/tdnn/train_raw_nnet.sh @@ -0,0 +1,547 @@ +#!/bin/bash + +# note, TDNN is the same as what we used to call multisplice. + +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# 2013 Xiaohui Zhang +# 2013 Guoguo Chen +# 2014-2016 Vimal Manohar +# 2014 Vijayaditya Peddinti +# Apache 2.0. + + +# Begin configuration section. +cmd=run.pl +num_epochs=15 # Number of epochs of training; + # the number of iterations is worked out from this. +initial_effective_lrate=0.01 +final_effective_lrate=0.001 +rand_prune=4.0 # Relates to a speedup we do for LDA. +minibatch_size=512 # This default is suitable for GPU-based training. + # Set it to 128 for multi-threaded CPU-based training. +max_param_change=2.0 # max param change per minibatch +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training +num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training +prior_subset_size=20000 # 20k samples per job, for computing priors. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 # can be used for rerunning after partial +online_ivector_dir= +remove_egs=true # set to false to disable removing egs after training is done. + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +stage=-6 +exit_stage=-100 # you can set this to terminate the training early. Exits before running this stage + +chunk_training=false # if true training is done with chunk randomization, rather than frame randomization + +randprune=4.0 # speeds up LDA. +use_gpu=true # if true, we run on GPU. +cleanup=true +egs_dir= +configs_dir= +max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. +lda_opts= +egs_opts= +transform_dir= # If supplied, this dir used instead of alidir to find transforms. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +frames_per_eg=8 # to be passed on to get_egs.sh + +# Raw nnet training options i.e. 
without transition model +nj=4 +dense_targets=true # Use dense targets instead of sparse targets + +# End configuration section. + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train scp:snr_targets/targets.scp exp/nnet3_snr_predictor" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " # data, 0.00025 for large data" + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job, for CPU-based training (will affect" + echo " # results as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-indexes " + echo " # Frame indices used for each splice layer." + echo " # Format : layer/....layer/ " + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +targets_scp=$2 +dir=$3 + +# Check some files. +for f in $data/feats.scp $targets_scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs + + +# First work out the feature and iVector dimension, needed for tdnn config creation. +feat_dim=$(feat-to-dim --print-args=false scp:$data/feats.scp -) || \ + { echo "$0: Error getting feature dim"; exit 1; } + +if [ -z "$online_ivector_dir" ]; then + ivector_dim=0 +else + ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; +fi + +if [ ! -z "$configs_dir" ]; then + cp -rT $configs_dir $dir/configs || exit 1 +fi + +if [ $stage -le -5 ]; then + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
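+  # (init.config, as produced by the config generator, is expected to contain only
+  # the input node and the initial splicing/Append descriptor; nnet3-init turns it
+  # into init.raw, whose output is the spliced input on which the LDA-stats
+  # accumulation further below operates.)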
+ $cmd $dir/log/nnet_init.log \ + nnet3-init --srand=-2 $dir/configs/init.config $dir/init.raw || exit 1; +fi + +# sourcing the "vars" below sets +# model_left_context=(something) +# model_right_context=(something) +# num_hidden_layers=(something) +# num_targets=(something) +# add_lda=(true|false) +# include_log_softmax=(true|false) +# objective_type=(something) +. $dir/configs/vars || exit 1; +left_context=$model_left_context +right_context=$model_right_context + +[ -z "$num_targets" ] && echo "\$num_targets is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$add_lda" ] && echo "\$add_lda is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$include_log_softmax" ] && echo "\$include_log_softmax is not defined. Needs to be defined in $dir/configs/vars." && exit 1 +[ -z "$objective_type" ] && echo "\$objective_type is not defined. Needs to be defined in $dir/configs/vars." && exit 1 + +context_opts="--left-context=$left_context --right-context=$right_context" + +! [ "$num_hidden_layers" -gt 0 ] && echo \ + "$0: Expected num_hidden_layers to be defined" && exit 1; + +if $dense_targets; then + tmp_num_targets=`feat-to-dim scp:$targets_scp - 2>/dev/null` || exit 1 + + if [ $tmp_num_targets -ne $num_targets ]; then + echo "Mismatch between num-targets provided to script vs configs" + exit 1 + fi +fi + +if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then + extra_opts=() + [ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") + [ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) + [ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) + extra_opts+=(--transform-dir "$transform_dir") + extra_opts+=(--left-context $left_context) + extra_opts+=(--right-context $right_context) + echo "$0: calling get_egs.sh" + + if $dense_targets; then + target_type=dense + else + target_type=sparse + fi + + steps/nnet3/get_egs_targets.sh $egs_opts "${extra_opts[@]}" \ + --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" --nj $nj \ + --frames-per-eg $frames_per_eg \ + --target-type $target_type --num-targets $num_targets \ + $data $targets_scp $dir/egs || exit 1; +fi + +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; + exit 1; +fi +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; + exit 1; +fi + +# copy any of the following that exist, to $dir. +cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null + +# confirm that the egs_dir has the necessary context (especially important if +# the --egs-dir option was used on the command line). +egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 +egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ + echo "$0: egs in $egs_dir have too little context" && exit -1; + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. 
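+# For example, with num_archives=120 and frames_per_eg=8, frame-level training
+# gives num_archives_expanded=960, while chunk training keeps it at 120.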
+if [ "$chunk_training" == "true" ]; then + num_archives_expanded=$num_archives +else + num_archives_expanded=$[$num_archives*$frames_per_eg] +fi + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + + +if $add_lda && [ $stage -le -3 ]; then + echo "$0: getting preconditioning matrix for input features." + num_lda_jobs=$num_archives + [ $num_lda_jobs -gt $max_lda_jobs ] && num_lda_jobs=$max_lda_jobs + + # Write stats with the same format as stats for LDA. + $cmd JOB=1:$num_lda_jobs $dir/log/get_lda_stats.JOB.log \ + nnet3-acc-lda-stats --rand-prune=$rand_prune \ + $dir/init.raw "ark:$egs_dir/egs.JOB.ark" $dir/JOB.lda_stats || exit 1; + + all_lda_accs=$(for n in $(seq $num_lda_jobs); do echo $dir/$n.lda_stats; done) + $cmd $dir/log/sum_transform_stats.log \ + sum-lda-accs $dir/lda_stats $all_lda_accs || exit 1; + + rm $all_lda_accs || exit 1; + + # this computes a fixed affine transform computed in the way we described in + # Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled variant + # of an LDA transform but without dimensionality reduction. + $cmd $dir/log/get_transform.log \ + nnet-get-feature-transform $lda_opts $dir/lda.mat $dir/lda_stats || exit 1; + + ln -sf ../lda.mat $dir/configs/lda.mat +fi + + +if [ $stage -le -1 ]; then + # Add the first layer; this will add in the lda.mat + $cmd $dir/log/add_first_layer.log \ + nnet3-init --srand=-3 $dir/init.raw $dir/configs/layer1.config $dir/0.raw || exit 1; + +fi + + +# set num_iters so that as close as possible, we process the data $num_epochs +# times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives_expanded, +# where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[($num_archives_to_process*2)/($num_jobs_initial+$num_jobs_final)] + +finish_add_layers_iter=$[$num_hidden_layers * $add_layers_period] + +! [ $num_iters -gt $[$finish_add_layers_iter+2] ] \ + && echo "$0: Insufficient epochs" && exit 1 + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + combine_queue_opt="--gpu 1" + prior_gpu_opt="--use-gpu=yes" + prior_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. + prior_gpu_opt="--use-gpu=no" + prior_queue_opt="" +fi + + +approx_iters_per_epoch_final=$[$num_archives_expanded/$num_jobs_final] +# First work out how many iterations we want to combine over in the final +# nnet3-combine-fast invocation. (We may end up subsampling from these if the +# number exceeds max_model_combine). 
The number we use is: +# min(max(max_models_combine, approx_iters_per_epoch_final), +# 1/2 * iters_after_last_layer_added) +num_iters_combine=$max_models_combine +if [ $num_iters_combine -lt $approx_iters_per_epoch_final ]; then + num_iters_combine=$approx_iters_per_epoch_final +fi +half_iters_after_add_layers=$[($num_iters-$finish_add_layers_iter)/2] +if [ $num_iters_combine -gt $half_iters_after_add_layers ]; then + num_iters_combine=$half_iters_after_add_layers +fi +first_model_combine=$[$num_iters-$num_iters_combine+1] + +x=0 + + +compute_accuracy=false +if [ "$objective_type" == "linear" ]; then + compute_accuracy=true +fi + +while [ $x -lt $num_iters ]; do + [ $x -eq $exit_stage ] && echo "$0: Exiting early due to --exit-stage $exit_stage" && exit 0; + + this_num_jobs=$(perl -e "print int(0.5+$num_jobs_initial+($num_jobs_final-$num_jobs_initial)*$x/$num_iters);") + + ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; + this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); + + echo "On iteration $x, learning rate is $this_learning_rate." + + if [ $x -ge 0 ] && [ $stage -le $x ]; then + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_prob_valid.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.$x.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no $dir/$[x-1].raw $dir/$x.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info $dir/$x.raw & + fi + + echo "Training neural net (pass $x)" + + if [ $x -gt 0 ] && \ + [ $x -le $[($num_hidden_layers-1)*$add_layers_period] ] && \ + [ $[$x%$add_layers_period] -eq 0 ]; then + do_average=false # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers=$[1+$x/$add_layers_period] + config=$dir/configs/layer$cur_num_hidden_layers.config + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw - | nnet3-init --srand=$x - $config - |" + else + do_average=true + if [ $x -eq 0 ]; then do_average=false; fi # on iteration 0, pick the best, don't average. + raw="nnet3-copy --learning-rate=$this_learning_rate $dir/$x.raw -|" + fi + if $do_average; then + this_minibatch_size=$minibatch_size + else + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + this_minibatch_size=$[$minibatch_size/2]; + fi + + rm $dir/.error 2>/dev/null + + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
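+      # Worked example with made-up numbers: if num_archives_processed=10,
+      # num_archives=4 and frames_per_eg=8, then job n=1 gets k=10, so
+      # archive=(10%4)+1=3 and frame=(10/4)%8=2.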
+      for n in $(seq $this_num_jobs); do
+        k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive
+                                               # the other indexes from.
+        archive=$[($k%$num_archives)+1]; # work out the 1-based archive index.
+        frame=$[(($k/$num_archives)%$frames_per_eg)]; # work out the 0-based frame
+        # index; this increases more slowly than the archive index because the
+        # same archive with different frame indexes will give similar gradients,
+        # so we want to separate them in time.
+
+        $cmd $train_queue_opt $dir/log/train.$x.$n.log \
+          nnet3-train $parallel_train_opts \
+          --max-param-change=$max_param_change "$raw" \
+          "ark,bg:nnet3-copy-egs --frame=$frame $context_opts ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \
+          $dir/$[$x+1].$n.raw || touch $dir/.error &
+      done
+      wait
+    )
+    # the error message below is not that informative, but $cmd will
+    # have printed a more specific one.
+    [ -f $dir/.error ] && echo "$0: error on iteration $x of training" && exit 1;
+
+    nnets_list=
+    for n in `seq 1 $this_num_jobs`; do
+      nnets_list="$nnets_list $dir/$[$x+1].$n.raw"
+    done
+
+    if $do_average; then
+      # average the output of the different jobs.
+      $cmd $dir/log/average.$x.log \
+        nnet3-average $nnets_list $dir/$[x+1].raw || exit 1;
+    else
+      # choose the best from the different jobs.
+      n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) {
+          $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn";
+          undef $logprob; while (<F>) { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } }
+          close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob;
+          $best_n=$n; } } print "$best_n\n"; ' $this_num_jobs $dir/log/train.$x.%d.log) || exit 1;
+      [ -z "$n" ] && echo "Error getting best model" && exit 1;
+      $cmd $dir/log/select.$x.log \
+        nnet3-copy $dir/$[$x+1].$n.raw $dir/$[$x+1].raw || exit 1;
+    fi
+
+    rm $nnets_list
+    [ ! -f $dir/$[$x+1].raw ] && exit 1;
+    if [ -f $dir/$[$x-1].raw ] && $cleanup && \
+       [ $[($x-1)%100] -ne 0 ] && [ $[$x-1] -lt $first_model_combine ]; then
+      rm $dir/$[$x-1].raw
+    fi
+  fi
+  x=$[$x+1]
+  num_archives_processed=$[$num_archives_processed+$this_num_jobs]
+done
+
+if [ $stage -le $num_iters ]; then
+  echo "Doing final combination to produce final.raw"
+
+  # Now do combination.  In the nnet3 setup, the logic for averaging subsets of
+  # the models, in the case where there are too many models to reliably estimate
+  # interpolation factors (max_models_combine), has been moved into nnet3-combine.
+  nnets_list=()
+  for n in $(seq 0 $[num_iters_combine-1]); do
+    iter=$[$first_model_combine+$n]
+    nnet=$dir/$iter.raw
+    [ ! -f $nnet ] && echo "Expected $nnet to exist" && exit 1;
+    nnets_list[$n]=$nnet
+  done
+
+  # Below we run nnet3-combine under $combine_queue_opt, which requests a GPU when
+  # --use-gpu is true (see above); without a GPU the combination will be quite slow,
+  # as the program is not multi-threaded.
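+  # (The models being combined are $first_model_combine.raw .. $num_iters.raw,
+  # $num_iters_combine of them; nnet3-combine estimates interpolation weights that
+  # are constrained to be positive and to sum to one, per the --enforce-* options.)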
+ + $cmd $combine_queue_opt $dir/log/combine.log \ + nnet3-combine --num-iters=40 \ + --enforce-sum-to-one=true --enforce-positive-weights=true \ + --verbose=3 "${nnets_list[@]}" "ark,bg:nnet3-merge-egs --minibatch-size=1024 ark:$egs_dir/combine.egs ark:-|" \ + $dir/final.raw || exit 1; + + # Compute the probability of the final, combined model with + # the same subset we used for the previous compute_probs, as the + # different subsets will lead to different probs. + $cmd $dir/log/compute_prob_valid.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.egs ark:- |" & + $cmd $dir/log/compute_prob_train.final.log \ + nnet3-compute-prob --compute-accuracy=$compute_accuracy $dir/final.raw \ + "ark,bg:nnet3-merge-egs ark:$egs_dir/train_diagnostic.egs ark:- |" & +fi + +if $include_log_softmax && [ $stage -le $[$num_iters+1] ]; then + echo "Getting average posterior for purpose of using as prior to convert posteriors to likelihoods." + # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi + rm $dir/post.$x.*.vec 2>/dev/null + $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$egs_dir/egs.$egs_part.ark ark:- \| \ + nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ + nnet3-merge-egs ark:- ark:- \| \ + nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ + $dir/final.raw ark:- ark:- \| \ + matrix-sum-rows ark:- ark:- \| vector-sum ark:- $dir/post.$x.JOB.vec || exit 1; + + sleep 3; # make sure there is time for $dir/post.$x.*.vec to appear. + + $cmd $dir/log/vector_sum.$x.log \ + vector-sum $dir/post.$x.*.vec $dir/post.$x.vec || exit 1; + + rm -f $dir/post.$x.*.vec; + +fi + + +if [ ! -f $dir/final.raw ]; then + echo "$0: $dir/final.raw does not exist." + # we don't want to clean up if the training didn't succeed. + exit 1; +fi + +sleep 2 + +echo Done + +if $cleanup; then + echo Cleaning up data + if $remove_egs && [[ $egs_dir =~ $dir/egs* ]]; then + steps/nnet2/remove_egs.sh $egs_dir + fi + + echo Removing most of the models + for x in `seq 0 $num_iters`; do + if [ $[$x%100] -ne 0 ] && [ $x -ne $num_iters ] && [ -f $dir/$x.mdl ]; then + # delete all but every 100th model; don't delete the ones which combine to form the final model. + rm $dir/$x.raw + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_discriminative.sh b/egs/wsj/s5/steps/nnet3/train_discriminative.sh new file mode 100755 index 00000000000..838ae311906 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_discriminative.sh @@ -0,0 +1,391 @@ +#!/bin/bash + +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2014-2015 Vimal Manohar +# Apache 2.0. + +set -o pipefail + +# This script does MPE or MMI or state-level minimum bayes risk (sMBR) training +# using egs obtained by steps/nnet3/get_egs_discriminative.sh + +# Begin configuration section. +cmd=run.pl +num_epochs=4 # Number of epochs of training; + # the number of iterations is worked out from this. + # Be careful with this: we actually go over the data + # num-epochs * frame-subsampling-factor times, due to + # using different data-shifts. +use_gpu=true +truncate_deriv_weights=0 # can be used to set to zero the weights of derivs from frames + # near the edges. (counts subsampled frames). 
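+                         # e.g. setting it to 2 would zero the derivative weights of
+                         # the 2 outermost (subsampled) frames on each side of a chunk.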
+apply_deriv_weights=true +use_frame_shift=false +run_diagnostics=true +learning_rate=0.00002 +max_param_change=2.0 +scale_max_param_change=false # if this option is used, scale it by num-jobs. + +effective_lrate= # If supplied, overrides the learning rate, which gets set to effective_lrate * num_jobs_nnet. +acoustic_scale=0.1 # acoustic scale for MMI/MPFE/SMBR training. +boost=0.0 # option relevant for MMI + +criterion=smbr +drop_frames=false # option relevant for MMI +one_silence_class=true # option relevant for MPE/SMBR +num_jobs_nnet=4 # Number of neural net jobs to run in parallel. Note: this + # will interact with the learning rates (if you decrease + # this, you'll have to decrease the learning rate, and vice + # versa). +regularization_opts= +minibatch_size=64 # This is the number of examples rather than the number of output frames. +modify_learning_rates=false +last_layer_factor=1.0 # relates to modify-learning-rates +first_layer_factor=1.0 # relates to modify-learning-rates +shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + + +stage=-3 + +adjust_priors=true +num_threads=16 # this is the default but you may want to change it, e.g. to 1 if + # using GPUs. + +cleanup=true +keep_model_iters=1 +retroactive=false +remove_egs=false +src_model= # will default to $degs_dir/final.mdl + +left_deriv_truncate= # number of time-steps to avoid using the deriv of, on the left. +right_deriv_truncate= # number of time-steps to avoid using the deriv of, on the right. +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 exp/nnet3/tdnn_sp_degs exp/nnet3/tdnn_sp_smbr" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|4> # Number of epochs of training" + echo " --learning-rate # Learning rate to use" + echo " --effective-lrate # If supplied, learning rate will be set to" + echo " # this value times num-jobs-nnet." + echo " --num-jobs-nnet # Number of parallel jobs to use for main neural net" + echo " # training (will affect results as well as speed; try 8, 16)" + echo " # Note: if you increase this, you may want to also increase" + echo " # the learning rate. Also note: if there are fewer archives" + echo " # of egs than this, it will get reduced automatically." + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size. With GPU, must be 1." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... " + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." 
+ echo " --criterion # Training criterion: may be smbr, mmi or mpfe" + echo " --boost # Boosting factor for MMI (e.g., 0.1)" + echo " --drop-frames # Option that affects MMI training: if true, we exclude gradients from frames" + echo " # where the numerator transition-id is not in the denominator lattice." + echo " --one-silence-class # Option that affects MPE/SMBR training (will tend to reduce insertions)" + echo " --modify-learning-rates # If true, modify learning rates to try to equalize relative" + echo " # changes across layers." + exit 1; +fi + +degs_dir=$1 +dir=$2 + +[ -z "$src_model" ] && src_model=$degs_dir/final.mdl + +# Check some files. +for f in $degs_dir/degs.1.ark $degs_dir/info/{num_archives,silence.csl,frames_per_eg,egs_per_archive} $src_model; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +mkdir -p $dir/log || exit 1; + +# copy some things +for f in splice_opts cmvn_opts tree final.mat; do + if [ -f $degs_dir/$f ]; then + cp $degs_dir/$f $dir/ || exit 1; + fi +done + +silphonelist=`cat $degs_dir/info/silence.csl` || exit 1; + +num_archives_priors=0 +if $adjust_priors; then + num_archives_priors=`cat $degs_dir/info/num_archives_priors` || exit 1 +fi + +frames_per_eg=$(cat $degs_dir/info/frames_per_eg) || { echo "error: no such file $degs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $degs_dir/info/num_archives) || exit 1; +frame_subsampling_factor=$(cat $degs_dir/info/frame_subsampling_factor) + +echo $frame_subsampling_factor > $dir/frame_subsampling_factor + +if $use_frame_shift; then + num_archives_expanded=$[$num_archives*$frame_subsampling_factor] +else + num_archives_expanded=$num_archives +fi + +if [ $num_jobs_nnet -gt $num_archives_expanded ]; then + echo "$0: num-jobs-nnet $num_jobs_nnet exceeds number of archives $num_archives_expanded," + echo " ... setting it to $num_archives." + num_jobs_nnet=$num_archives_expanded +fi + +num_archives_to_process=$[$num_epochs*$num_archives_expanded] +num_archives_processed=0 +num_iters=$[$num_archives_to_process/$num_jobs_nnet] + +echo "$0: Will train for $num_epochs epochs = $num_iters iterations" + +if $use_gpu; then + parallel_suffix="" + train_queue_opt="--gpu 1" + parallel_train_opts= + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be very slow. nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" +fi + +if $use_frame_shift; then + num_epochs_expanded=$[num_epochs*frame_subsampling_factor] +else + num_epochs_expanded=$num_epochs +fi + +for e in $(seq 1 $num_epochs_expanded); do + x=$[($e*$num_archives)/$num_jobs_nnet] # gives the iteration number. + iter_to_epoch[$x]=$e +done + +if [ $stage -le -1 ]; then + echo "$0: Copying initial model and modifying preconditioning setup" + + # Note, the baseline model probably had preconditioning, and we'll keep it; + # but we want online preconditioning with a larger number of samples of + # history, since in this setup the frames are only randomized at the segment + # level so they are highly correlated. It might make sense to tune this a + # little, later on, although I doubt it matters once the --num-samples-history + # is large enough. + + if [ ! 
-z "$effective_lrate" ]; then + learning_rate=$(perl -e "print ($num_jobs_nnet*$effective_lrate);") + echo "$0: setting learning rate to $learning_rate = --num-jobs-nnet * --effective-lrate." + fi + + $cmd $dir/log/convert.log \ + nnet3-am-copy --learning-rate=$learning_rate "$src_model" $dir/0.mdl || exit 1; +fi + + +rm $dir/.error 2>/dev/null + +x=0 + +deriv_time_opts= +[ ! -z "$left_deriv_truncate" ] && deriv_time_opts="--optimization.min-deriv-time=$left_deriv_truncate" +[ ! -z "$right_deriv_truncate" ] && \ + deriv_time_opts="$deriv_time_opts --optimization.max-deriv-time=$((frames_per_eg - right_deriv_truncate))" + +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + if $run_diagnostics; then + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics + $cmd $dir/log/compute_objf_valid.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/valid_diagnostic.degs & + $cmd $dir/log/compute_objf_train.$x.log \ + nnet3-discriminative-compute-objf $regularization_opts \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale \ + $dir/$x.mdl \ + ark:$degs_dir/train_diagnostic.degs & + fi + + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi + + + echo "Training neural net (pass $x)" + + cache_read_opt="--read-cache=$dir/cache.$x" + + ( # this sub-shell is so that when we "wait" below, + # we only wait for the training jobs that we just spawned, + # not the diagnostic jobs that we spawned above. + + # We can't easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + for n in `seq $num_jobs_nnet`; do + k=$[$num_archives_processed + $n - 1]; # k is a zero-based index that we'll derive + # the other indexes from. + archive=$[($k%$num_archives)+1]; # work out the 1-based archive index. + + if [ $n -eq 1 ]; then + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. 
+ cache_write_opt=" --write-cache=$dir/cache.$[$x+1]" + else + cache_write_opt="" + fi + + if $use_frame_shift; then + if [ $[num_archives % frame_subsampling_factor] -ne 0 ]; then + frame_shift=$[k % frame_subsampling_factor] + else + frame_shift=$[(k + k/num_archives) % frame_subsampling_factor] + fi + else + frame_shift=0 + fi + + #archive=$[(($n+($x*$num_jobs_nnet))%$num_archives)+1] + if $scale_max_param_change; then + this_max_param_change=$(perl -e "print ($max_param_change * $num_jobs_nnet);") + else + this_max_param_change=$max_param_change + fi + + $cmd $train_queue_opt $dir/log/train.$x.$n.log \ + nnet3-discriminative-train $cache_read_opt $cache_write_opt \ + --apply-deriv-weights=$apply_deriv_weights \ + $parallel_train_opts $deriv_time_opts \ + --max-param-change=$this_max_param_change \ + --silence-phones=$silphonelist \ + --criterion=$criterion --drop-frames=$drop_frames \ + --one-silence-class=$one_silence_class \ + --boost=$boost --acoustic-scale=$acoustic_scale $regularization_opts \ + $dir/$x.mdl \ + "ark:nnet3-discriminative-copy-egs --frame-shift=$frame_shift --truncate-deriv-weights=$truncate_deriv_weights ark:$degs_dir/degs.$archive.ark ark:- | nnet3-discriminative-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-discriminative-merge-egs --minibatch-size=$minibatch_size ark:- ark:- |" \ + $dir/$[$x+1].$n.raw || touch $dir/.error & + done + wait + [ -f $dir/.error ] && exit 1 + ) + [ -f $dir/.error ] && { echo "Found $dir/.error. See $dir/log/train.$x.*.log"; exit 1; } + + nnets_list=$(for n in $(seq $num_jobs_nnet); do echo $dir/$[$x+1].$n.raw; done) + + # below use run.pl instead of a generic $cmd for these very quick stages, + # so that we don't run the risk of waiting for a possibly hard-to-get GPU. + run.pl $dir/log/average.$x.log \ + nnet3-average $nnets_list - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + + if $modify_learning_rates; then + run.pl $dir/log/modify_learning_rates.$x.log \ + nnet3-modify-learning-rates --retroactive=$retroactive \ + --last-layer-factor=$last_layer_factor \ + --first-layer-factor=$first_layer_factor \ + "nnet3-am-copy --raw $dir/$x.mdl -|" "nnet3-am-copy --raw $dir/$[$x+1].mdl -|" - \| \ + nnet3-am-copy --set-raw-nnet=- $dir/$x.mdl $dir/$[$x+1].mdl || exit 1; + fi + rm $nnets_list + + if [ ! -z "${iter_to_epoch[$x]}" ]; then + e=${iter_to_epoch[$x]} + ln -sf $x.mdl $dir/epoch$e.mdl + fi + + if $adjust_priors && [ ! -z "${iter_to_epoch[$x]}" ]; then + if [ ! -f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." + echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + ( + e=${iter_to_epoch[$x]} + rm $dir/.error 2> /dev/null + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd" --use-gpu false \ + --use-raw-nnet false --iter epoch$e $dir $degs_dir \ + || { touch $dir/.error; echo "Error in adjusting priors. See $dir/log/adjust_priors.epoch$e.log"; exit 1; } + ) & + fi + + [ -f $dir/.error ] && { echo "Found $dir/.error. Error on iteration $x"; exit 1; } + fi + + rm $dir/cache.$x 2>/dev/null || true + x=$[$x+1] + num_archives_processed=$[num_archives_processed+num_jobs_nnet] +done + +rm $dir/final.mdl 2>/dev/null +cp $dir/$x.mdl $dir/final.mdl +ln -sf final.mdl $dir/epoch$num_epochs_expanded.mdl + +if $adjust_priors && [ $stage -le $num_iters ]; then + if [ ! 
-f $degs_dir/priors_egs.1.ark ]; then + echo "$0: Expecting $degs_dir/priors_egs.1.ark to exist since --adjust-priors was true." + echo "$0: Run this script with --adjust-priors false to not adjust priors" + exit 1 + fi + + steps/nnet3/adjust_priors.sh --egs-type priors_egs \ + --num-jobs-compute-prior $num_archives_priors \ + --cmd "$cmd $prior_queue_opt" --use-gpu false \ + --use-raw-nnet false --iter epoch$num_epochs_expanded \ + $dir $degs_dir || exit 1 +fi + +echo Done + + +# function to remove egs that might be soft links. +remove () { for x in $*; do [ -L $x ] && rm $(readlink -f $x); rm $x; done } + +if $cleanup && $remove_egs; then # note: this is false by default. + echo Removing training examples + remove $degs_dir/degs.* + remove $degs_dir/priors_egs.* +fi + + +if $cleanup; then + echo Removing most of the models + for x in `seq 1 $keep_model_iters $num_iters`; do + if [ -z "${iter_to_epoch[$x]}" ]; then + # if $x is not an epoch-final iteration.. + rm $dir/$x.mdl 2>/dev/null + fi + done +fi + diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py new file mode 100755 index 00000000000..e56baed97a9 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains a feed forward DNN acoustic model using the cross-entropy objective. + DNNs include simple DNNs, TDNNs and CNNs. + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.frames-per-eg", type=int, dest='frames_per_eg', + default = 8, + help="Number of output labels per example") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. 
If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help="Controls randomization of the samples on each" + "iteration. If 0 or a large value the randomization is" + "complete, but this will consume memory and cause spikes" + "in disk I/O. Smaller is easier on disk and memory but" + "less random. It's not a huge deal though, as samples" + "are anyway randomized right at the start." + "(the point of this is to get data in different" + "minibatches on different iterations, since in the" + "preconditioning method, 2 samples in the same minibatch" + "can affect each others' gradients.") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers" + "during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="The maximum change in parameters allowed per minibatch," + "measured in Frobenius norm over the entire model") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=400000, + help="This is really the number of egs in each archive.") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + parser.add_argument("--trainer.presoftmax-prior-scale-power", type=float, dest='presoftmax_prior_scale_power', + default=-0.25, + help="") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. 
'0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.minibatch-size", type=float, dest='minibatch_size', + default = 512, + help="Size of the minibatch used to compute the gradient") + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.0, + help="""Momentum used in update computation. + Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. 
""") + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.frames_per_eg < 1: + raise Exception("--egs.frames-per-eg should have a minimum value of 1") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("This scripts expects {0} to exist and have a configs" + " directory which is the output of make_configs.py script") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + +# this is the main method which differs between RNN and DNN training +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, minibatch_size, + run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. 
+ # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + frame = (k / num_archives) % frames_per_eg + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + "{raw_model}" \ + "ark,bg:nnet3-copy-egs --frame={frame} {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={minibatch_size} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + frame = frame, + momentum = momentum, max_param_change = max_param_change, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + minibatch_size = minibatch_size), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, minibatch_size, + frames_per_eg, num_hidden_layers, add_layers_period, + left_context, right_context, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + + + + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file ) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. 
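+        # (So with, say, add_layers_period=2 and num_hidden_layers=4, averaging is
+        # skipped on iterations 0, 2, 4 and 6: iteration 0 plus the three
+        # layer-insertion iterations.)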
+ raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_minibatch_size = minibatch_size + cur_max_param_change = max_param_change + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_minibatch_size = minibatch_size / 2 + cur_max_param_change = float(max_param_change) / math.sqrt(2) + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, frames_per_eg, + left_context, right_context, + momentum, max_param_change, + shuffle_buffer_size, cur_minibatch_size, + run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. + RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [left_context, right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. 
This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. + + if (args.stage <= -5): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -4) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + left_context, right_context, run_opts, + frames_per_eg = args.frames_per_eg, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.frames_per_eg == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -3): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -2): + logger.info("Computing initial vector for FixedScaleComponent before" + " softmax, using priors^{prior_scale} and rescaling to" + " average 1".format(prior_scale = args.presoftmax_prior_scale_power)) + + ComputePresoftmaxPriorScale(args.dir, args.ali_dir, num_jobs, run_opts, + presoftmax_prior_scale_power = args.presoftmax_prior_scale_power) + + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
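+    # For example, with made-up values num_archives=120, frames_per_eg=8,
+    # num_epochs=8, num_jobs_initial=1 and num_jobs_final=8:
+    # num_archives_expanded = 960 and num_iters = (8*960*2)/(1+8) = 1706.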
+ num_archives_expanded = num_archives * args.frames_per_eg + num_archives_to_process = args.num_epochs * num_archives_expanded + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives_expanded, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + + logger.info("On iteration {0}, learning rate is {1}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed))) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + args.minibatch_size, args.frames_per_eg, + num_hidden_layers, args.add_layers_period, + left_context, right_context, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, num_iters_combine, egs_dir, run_opts) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + 
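+        # (The averaged posterior vector becomes the priors stored in final.mdl;
+        # at decode time the log-priors are subtracted from the network's
+        # log-posteriors to form the pseudo-log-likelihoods used by the decoder.)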
logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + SendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py new file mode 100755 index 00000000000..dec41409b06 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -0,0 +1,717 @@ +#!/usr/bin/env python + + +# Copyright 2016 Vijayaditya Peddinti. +# Apache 2.0. + + +# this script is based on steps/nnet3/lstm/train.sh + + +import subprocess +import argparse +import sys +import pprint +import logging +import imp +import traceback +from nnet3_train_lib import * + +nnet3_log_parse = imp.load_source('', 'steps/nnet3/report/nnet3_log_parse_lib.py') + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)s - %(funcName)s - %(levelname)s ] %(message)s') +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting RNN trainer (train_rnn.py)') + + +def GetArgs(): + # we add compulsary arguments as named arguments for readability + parser = argparse.ArgumentParser(description=""" + Trains an RNN acoustic model using the cross-entropy objective. + RNNs include LSTMs, BLSTMs and GRUs. + RNN acoustic model training differs from feed-forward DNN training + in the following ways + 1. RNN acoustic models train on output chunks rather than individual + outputs + 2. The training includes additional stage of shrinkage, where + the parameters of the model are scaled when the derivative averages + at the non-linearities are below a threshold. + 3. 
RNNs can also be trained with state preservation training + """, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + # feat options + parser.add_argument("--feat.online-ivector-dir", type=str, dest='online_ivector_dir', + default = None, action = NullstrToNoneAction, + help="""directory with the ivectors extracted in + an online fashion.""") + parser.add_argument("--feat.cmvn-opts", type=str, dest='cmvn_opts', + default = None, action = NullstrToNoneAction, + help="A string specifying '--norm-means' and '--norm-vars' values") + + # egs extraction options + parser.add_argument("--egs.chunk-width", type=int, dest='chunk_width', + default = 20, + help="""Number of output labels in the sequence + used to train an LSTM. + Caution: if you double this you should halve + --trainer.samples-per-iter.""") + parser.add_argument("--egs.chunk-left-context", type=int, dest='chunk_left_context', + default = 40, + help="""Number of left steps used in the estimation of LSTM + state before prediction of the first label""") + parser.add_argument("--egs.chunk-right-context", type=int, dest='chunk_right_context', + default = 0, + help="""Number of right steps used in the estimation of BLSTM + state before prediction of the first label""") + parser.add_argument("--egs.transform_dir", type=str, dest='transform_dir', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + parser.add_argument("--egs.dir", type=str, dest='egs_dir', + default = None, action = NullstrToNoneAction, + help="""Directory with egs. If specified this directory + will be used rather than extracting egs""") + parser.add_argument("--egs.stage", type=int, dest='egs_stage', + default = 0, help="Stage at which get_egs.sh should be restarted") + parser.add_argument("--egs.opts", type=str, dest='egs_opts', + default = None, action = NullstrToNoneAction, + help="""String to provide options directly to steps/nnet3/get_egs.sh script""") + + # trainer options + parser.add_argument("--trainer.num-epochs", type=int, dest='num_epochs', + default = 8, + help="Number of epochs to train the model") + parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', + default = 20000, + help="Number of samples for computing priors") + parser.add_argument("--trainer.num-jobs-compute-prior", type=int, dest='num_jobs_compute_prior', + default = 10, + help="The prior computation jobs are single threaded and run on the CPU") + parser.add_argument("--trainer.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help="The maximum number of models used in the final model combination stage. These models will themselves be averages of iteration-number ranges") + parser.add_argument("--trainer.shuffle-buffer-size", type=int, dest='shuffle_buffer_size', + default = 5000, + help=""" Controls randomization of the samples on each + iteration. If 0 or a large value the randomization is + complete, but this will consume memory and cause spikes + in disk I/O. Smaller is easier on disk and memory but + less random. It's not a huge deal though, as samples + are anyway randomized right at the start. 
+ (the point of this is to get data in different + minibatches on different iterations, since in the + preconditioning method, 2 samples in the same minibatch + can affect each others' gradients.""") + parser.add_argument("--trainer.add-layers-period", type=int, dest='add_layers_period', + default=2, + help="The number of iterations between adding layers during layer-wise discriminative training.") + parser.add_argument("--trainer.max-param-change", type=float, dest='max_param_change', + default=2.0, + help="""The maximum change in parameters allowed + per minibatch, measured in Frobenius norm over + the entire model""") + parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', + default=20000, + help="""This is really the number of egs in each + archive. Each eg has 'chunk_width' frames in it-- + for chunk_width=20, this value (20k) is equivalent + to the 400k number that we use as a default in + regular DNN training.""") + parser.add_argument("--trainer.lda.rand-prune", type=float, dest='rand_prune', + default=4.0, + help="""Value used in preconditioning matrix estimation""") + parser.add_argument("--trainer.lda.max-lda-jobs", type=float, dest='max_lda_jobs', + default=10, + help="""Max number of jobs used for LDA stats accumulation""") + + # Realignment parameters + parser.add_argument("--trainer.realign.command", type=str, dest='realign_command', + default=None, action=NullstrToNoneAction, + help="""Command to be used with steps/nnet3/align.sh during realignment""") + parser.add_argument("--trainer.realign.num-jobs", type=int, dest='realign_num_jobs', + default=30, + help="Number of jobs to use for realignment") + parser.add_argument("--trainer.realign.times", type=str, dest='realign_times', + default=None, action=NullstrToNoneAction, + help="""A space seperated string of realignment + times. Values must be between 0 and 1 + e.g. '0.1 0.2 0.3' """) + + parser.add_argument("--trainer.realign.use_gpu", type=str, dest='realign_use_gpu', + default=True, action=StrToBoolAction, + choices = ["true", "false"], + help="If true, gpu is used with steps/nnet3/align.sh") + + # Parameters for the optimization + parser.add_argument("--trainer.optimization.initial-effective-lrate", type=float, dest='initial_effective_lrate', + default = 0.0003, + help="Learning rate used during the initial iteration") + parser.add_argument("--trainer.optimization.final-effective-lrate", type=float, dest='final_effective_lrate', + default = 0.00003, + help="Learning rate used during the final iteration") + parser.add_argument("--trainer.optimization.num-jobs-initial", type=int, dest='num_jobs_initial', + default = 1, + help="Number of neural net jobs to run in parallel at the start of training") + parser.add_argument("--trainer.optimization.num-jobs-final", type=int, dest='num_jobs_final', + default = 8, + help="Number of neural net jobs to run in parallel at the end of training") + parser.add_argument("--trainer.optimization.max-models-combine", type=int, dest='max_models_combine', + default = 20, + help = """ The is the maximum number of models we give to the + final 'combine' stage, but these models will themselves + be averages of iteration-number ranges. """) + parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', + default = 0.5, + help="""Momentum used in update computation. 
+ Note: we implemented it in such a way that + it doesn't increase the effective learning rate.""") + parser.add_argument("--trainer.optimization.shrink-value", type=float, dest='shrink_value', + default = 0.99, + help="Scaling factor used for scaling the parameter matrices when the derivative averages are below the shrink-threshold at the non-linearities") + parser.add_argument("--trainer.optimization.shrink-threshold", type=float, dest='shrink_threshold', + default = 0.15, + help="If the derivative averages are below this threshold we scale the parameter matrices with the shrink-value. It is less than 0.25 for sigmoid non-linearities.") + + # RNN specific trainer options + parser.add_argument("--trainer.rnn.num-chunk-per-minibatch", type=int, dest='num_chunk_per_minibatch', + default=100, + help="Number of sequences to be processed in parallel every minibatch" ) + parser.add_argument("--trainer.rnn.num-bptt-steps", type=int, dest='num_bptt_steps', + default=None, + help="The number of time steps to back-propagate from the last label in the chunk. By default it is same as the chunk-width." ) + + # General options + parser.add_argument("--stage", type=int, default=-4, + help="Specifies the stage of the experiment to execution from") + parser.add_argument("--exit-stage", type=int, default=None, + help="If specified, training exits before running this stage") + parser.add_argument("--cmd", type=str, action = NullstrToNoneAction, + dest = "command", + help="""Specifies the script to launch jobs. + e.g. queue.pl for launching on SGE cluster + run.pl for launching on local machine + """, default = "queue.pl") + parser.add_argument("--use-gpu", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Use GPU for training", default=True) + parser.add_argument("--cleanup", type=str, action = StrToBoolAction, + choices = ["true", "false"], + help="Clean up models after training", default=True) + parser.add_argument("--cleanup.remove-egs", type=str, dest='remove_egs', + default = True, action = StrToBoolAction, + choices = ["true", "false"], + help="""If true, remove egs after experiment""") + parser.add_argument("--cleanup.preserve-model-interval", dest = "preserve_model_interval", + type=int, default=100, + help="Determines iterations for which models will be preserved during cleanup. If iter % preserve_model_interval == 0 model will be preserved.") + + parser.add_argument("--reporting.email", dest = "email", + type=str, default=None, action = NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. + NOTE: It assumes the machine on which the script is being run can send + emails from command line via. mail program. The + Kaldi mailing list will not support this feature. + It might require local expertise to setup. """) + parser.add_argument("--reporting.interval", dest = "reporting_interval", + type=int, default=0.1, + help="Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified then only failure notifications are sent") + + parser.add_argument("--feat-dir", type=str, required = True, + help="Directory with features used for training the neural network.") + parser.add_argument("--lang", type=str, required = True, + help="Languade directory") + parser.add_argument("--ali-dir", type=str, required = True, + help="Directory with alignments used for training the neural network.") + parser.add_argument("--dir", type=str, required = True, + help="Directory to store the models and all other files.") + + print(' '.join(sys.argv)) + + args = parser.parse_args() + + [args, run_opts] = ProcessArgs(args) + + return [args, run_opts] + +def ProcessArgs(args): + # process the options + if args.chunk_width < 1: + raise Exception("--egs.chunk-width should have a minimum value of 1") + + if args.chunk_left_context < 0: + raise Exception("--egs.chunk-left-context should be positive") + + if args.chunk_right_context < 0: + raise Exception("--egs.chunk-right-context should be positive") + + if (not os.path.exists(args.dir)) or (not os.path.exists(args.dir+"/configs")): + raise Exception("""This scripts expects {0} to exist and have a configs + directory which is the output of make_configs.py script""") + + if args.transform_dir is None: + args.transform_dir = args.ali_dir + # set the options corresponding to args.use_gpu + run_opts = RunOpts() + if args.use_gpu: + if not CheckIfCudaCompiled(): + logger.warning(""" + You are running with one thread but you have not compiled + for CUDA. You may be running a setup optimized for GPUs. If you have + GPUs and have nvcc installed, go to src/ and do ./configure; make""") + + run_opts.train_queue_opt = "--gpu 1" + run_opts.parallel_train_opts = "" + run_opts.combine_queue_opt = "--gpu 1" + run_opts.prior_gpu_opt = "--use-gpu=yes" + run_opts.prior_queue_opt = "--gpu 1" + + else: + logger.warning(""" + Without using a GPU this will be very slow. nnet3 does not yet support multiple threads.""") + + run_opts.train_queue_opt = "" + run_opts.parallel_train_opts = "--use-gpu=no" + run_opts.combine_queue_opt = "" + run_opts.prior_gpu_opt = "--use-gpu=no" + run_opts.prior_queue_opt = "" + + if args.realign_use_gpu is True: + run_opts.realign_use_gpu = True + run_opts.realign_queue_opt = "--gpu 1" + else: + run_opts.realign_use_gpu = False + run_opts.realign_queue_opt = "" + + if args.realign_command is None: + run_opts.realign_command = args.command + else: + run_opts.realign_command = args.realign_command + run_opts.realign_num_jobs = args.realign_num_jobs + + run_opts.command = args.command + run_opts.num_jobs_compute_prior = args.num_jobs_compute_prior + + return [args, run_opts] + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + +class NullstrToNoneAction(argparse.Action): + """ A custom action to convert empty strings passed by shell + to None in python. This is necessary as shell scripts print null strings + when a variable is not specified. We could use the more apt None + in python. 
""" + def __call__(self, parser, namespace, values, option_string=None): + if values.strip() == "": + setattr(namespace, self.dest, None) + else: + setattr(namespace, self.dest, values) + + +# a class to store run options +class RunOpts: + def __init__(self): + self.command = None + self.train_queue_opt = None + self.combine_queue_opt = None + self.prior_gpu_opt = None + self.prior_queue_opt = None + self.parallel_train_opts = None + self.realign_use_gpu = None + + +def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, num_chunk_per_minibatch, + cache_read_opt, run_opts): + # We cannot easily use a single parallel SGE job to do the main training, + # because the computation of which archive and which --frame option + # to use for each job is a little complex, so we spawn each one separately. + # this is no longer true for RNNs as we use do not use the --frame option + # but we use the same script for consistency with FF-DNN code + + context_opts="--left-context={0} --right-context={1}".format( + left_context, right_context) + processes = [] + for job in range(1,num_jobs+1): + k = num_archives_processed + job - 1 # k is a zero-based index that we will derive + # the other indexes from. + archive_index = (k % num_archives) + 1 # work out the 1-based archive index. + + cache_write_opt = "" + if job == 1: + # an option for writing cache (storing pairs of nnet-computations and + # computation-requests) during training. + cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1) + + process_handle = RunKaldiCommand(""" +{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ + nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \ + --print-interval=10 --momentum={momentum} \ + --max-param-change={max_param_change} \ + --optimization.min-deriv-time={min_deriv_time} "{raw_model}" \ + "ark,bg:nnet3-copy-egs {context_opts} ark:{egs_dir}/egs.{archive_index}.ark ark:- | nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} --srand={iter} ark:- ark:-| nnet3-merge-egs --minibatch-size={num_chunk_per_minibatch} --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + {dir}/{next_iter}.{job}.raw + """.format(command = run_opts.command, + train_queue_opt = run_opts.train_queue_opt, + dir = dir, iter = iter, next_iter = iter + 1, job = job, + parallel_train_opts = run_opts.parallel_train_opts, + cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt, + momentum = momentum, max_param_change = max_param_change, + min_deriv_time = min_deriv_time, + raw_model = raw_model_string, context_opts = context_opts, + egs_dir = egs_dir, archive_index = archive_index, + shuffle_buffer_size = shuffle_buffer_size, + num_chunk_per_minibatch = num_chunk_per_minibatch), + wait = False) + + processes.append(process_handle) + + all_success = True + for process in processes: + process.wait() + [stdout_value, stderr_value] = process.communicate() + print(stderr_value) + if process.returncode != 0: + all_success = False + + if not all_success: + open('{0}/.error'.format(dir), 'w').close() + raise Exception("There was error during training iteration {0}".format(iter)) + +def TrainOneIteration(dir, iter, egs_dir, + num_jobs, num_archives_processed, num_archives, + learning_rate, shrinkage_value, num_chunk_per_minibatch, + num_hidden_layers, add_layers_period, + left_context, right_context, 
min_deriv_time, + momentum, max_param_change, shuffle_buffer_size, + run_opts): + # Set off jobs doing some diagnostics, in the background. + # Use the egs dir from the previous iteration for the diagnostics + logger.info("Training neural net (pass {0})".format(iter)) + + ComputeTrainCvProbabilities(dir, iter, egs_dir, run_opts) + + if iter > 0: + ComputeProgress(dir, iter, egs_dir, run_opts) + + # an option for writing cache (storing pairs of nnet-computations + # and computation-requests) during training. + cache_read_opt = "" + if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0): + do_average = False # if we've just mixed up, don't do averaging but take the + # best. + cur_num_hidden_layers = 1 + iter / add_layers_period + config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file) + else: + do_average = True + if iter == 0: + do_average = False # on iteration 0, pick the best, don't average. + else: + cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter) + + if do_average: + cur_num_chunk_per_minibatch = num_chunk_per_minibatch + else: + # on iteration zero or when we just added a layer, use a smaller minibatch + # size (and we will later choose the output of just one of the jobs): the + # model-averaging isn't always helpful when the model is changing too fast + # (i.e. it can worsen the objective function), and the smaller minibatch + # size will help to keep the update stable. + cur_num_chunk_per_minibatch = num_chunk_per_minibatch / 2 + + try: + os.remove("{0}/.error".format(dir)) + except OSError: + pass + + TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives, + raw_model_string, egs_dir, + left_context, right_context, min_deriv_time, + momentum, max_param_change, + shuffle_buffer_size, cur_num_chunk_per_minibatch, + cache_read_opt, run_opts) + [models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter)) + nnets_list = [] + for n in models_to_average: + nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) + + if do_average: + # average the output of the different jobs. 
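The {shrink} scale applied in the next command is the shrinkage_value chosen in Train() further below. DoShrinkage itself is not part of this patch, but the --trainer.optimization.shrink-value / shrink-threshold help text above pins down the decision it makes; a minimal stand-in, assuming the derivative average is supplied by the caller, would be:

def _sketch_choose_shrinkage(mean_sigmoid_deriv, shrink_threshold=0.15,
                             shrink_value=0.99):
    # When the derivative averages at the sigmoid non-linearities fall below
    # shrink-threshold, the parameter matrices are scaled by shrink-value;
    # otherwise the scale stays at 1.0. How the derivative average is read
    # from the model is not shown in this patch.
    return shrink_value if mean_sigmoid_deriv < shrink_threshold else 1.0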
+ RunKaldiCommand(""" +{command} {dir}/log/average.{iter}.log \ +nnet3-average {nnet_list} - \| \ +nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter}.mdl + """.format(command = run_opts.command, + dir = dir, + iter = iter, + nnet_list = " ".join(nnets_list), + shrink = shrinkage_value, + new_iter = iter + 1)) + + else: + # choose the best model from different jobs + RunKaldiCommand(""" +{command} {dir}/log/select.{iter}.log \ + nnet3-am-copy --scale={shrink} --set-raw-nnet={dir}/{next_iter}.{best_model_index}.raw {dir}/{iter}.mdl {dir}/{next_iter}.mdl + """.format(command = run_opts.command, + dir = dir, iter = iter, next_iter = iter + 1, + shrink = shrinkage_value, best_model_index = best_model)) + + try: + for i in range(1, num_jobs + 1): + os.remove("{0}/{1}.{2}.raw".format(dir, iter + 1, i)) + except OSError: + raise Exception("Error while trying to delete the raw models") + + new_model = "{0}/{1}.mdl".format(dir, iter + 1) + + if not os.path.isfile(new_model): + raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter)) + elif os.stat(new_model).st_size == 0: + raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter)) + if cache_read_opt and os.path.exists("{0}/cache.{1}".format(dir, iter)): + os.remove("{0}/cache.{1}".format(dir, iter)) + + +# args is a Namespace with the required parameters +def Train(args, run_opts): + arg_string = pprint.pformat(vars(args)) + logger.info("Arguments for the experiment\n{0}".format(arg_string)) + + # Set some variables. + num_leaves = GetNumberOfLeaves(args.ali_dir) + num_jobs = GetNumberOfJobs(args.ali_dir) + feat_dim = GetFeatDim(args.feat_dir) + ivector_dim = GetIvectorDim(args.online_ivector_dir) + + # split the training data into parts for individual jobs + # we will use the same number of jobs as that used for alignment + SplitData(args.feat_dir, num_jobs) + shutil.copy('{0}/tree'.format(args.ali_dir), args.dir) + f = open('{0}/num_jobs'.format(args.dir), 'w') + f.write(str(num_jobs)) + f.close() + + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + [model_left_context, model_right_context, num_hidden_layers] = ParseModelConfigVarsFile(var_file) + # Initialize as "raw" nnet, prior to training the LDA-like preconditioning + # matrix. This first config just does any initial splicing that we do; + # we do this as it's a convenient way to get the stats for the 'lda-like' + # transform. 
+ + if (args.stage <= -4): + logger.info("Initializing a basic network for estimating preconditioning matrix") + RunKaldiCommand(""" +{command} {dir}/log/nnet_init.log \ + nnet3-init --srand=-2 {dir}/configs/init.config {dir}/init.raw + """.format(command = run_opts.command, + dir = args.dir)) + + left_context = args.chunk_left_context + model_left_context + right_context = args.chunk_right_context + model_right_context + + default_egs_dir = '{0}/egs'.format(args.dir) + if (args.stage <= -3) and args.egs_dir is None: + logger.info("Generating egs") + + GenerateEgs(args.feat_dir, args.ali_dir, default_egs_dir, + left_context, right_context, + args.chunk_width + left_context, + args.chunk_width + right_context, run_opts, + frames_per_eg = args.chunk_width, + egs_opts = args.egs_opts, + cmvn_opts = args.cmvn_opts, + online_ivector_dir = args.online_ivector_dir, + samples_per_iter = args.samples_per_iter, + transform_dir = args.transform_dir, + stage = args.egs_stage) + + if args.egs_dir is None: + egs_dir = default_egs_dir + else: + egs_dir = args.egs_dir + + [egs_left_context, egs_right_context, frames_per_eg, num_archives] = VerifyEgsDir(egs_dir, feat_dim, ivector_dim, left_context, right_context) + assert(args.chunk_width == frames_per_eg) + + if (args.num_jobs_final > num_archives): + raise Exception('num_jobs_final cannot exceed the number of archives in the egs directory') + + # copy the properties of the egs to dir for + # use during decoding + CopyEgsPropertiesToExpDir(egs_dir, args.dir) + + if (args.stage <= -2): + logger.info('Computing the preconditioning matrix for input features') + + ComputePreconditioningMatrix(args.dir, egs_dir, num_archives, run_opts, + max_lda_jobs = args.max_lda_jobs, + rand_prune = args.rand_prune) + + if (args.stage <= -1): + logger.info("Preparing the initial acoustic model.") + PrepareInitialAcousticModel(args.dir, args.ali_dir, run_opts) + + + # set num_iters so that as close as possible, we process the data $num_epochs + # times, i.e. $num_iters*$avg_num_jobs) == $num_epochs*$num_archives, + # where avg_num_jobs=(num_jobs_initial+num_jobs_final)/2. 
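The chunk-related bookkeeping in this RNN script differs from the feed-forward trainer in two ways: the egs carry extra frames for initializing the recurrent state, and truncated BPTT is expressed through the min-deriv-time computed a few lines below. A standalone sketch using the option defaults from this file (the model's own context values, normally read from the generated configs, are assumptions here):

def _sketch_rnn_bookkeeping(chunk_width=20, chunk_left_context=40,
                            chunk_right_context=0, model_left_context=2,
                            model_right_context=2, num_bptt_steps=None):
    # Context seen by the network: the model's own context plus the extra
    # frames requested for estimating the recurrent state.
    left_context = chunk_left_context + model_left_context
    right_context = chunk_right_context + model_right_context
    # These combined values are what the GenerateEgs call above receives,
    # alongside frames_per_eg = chunk_width.
    chunk_plus_left = chunk_width + left_context
    chunk_plus_right = chunk_width + right_context
    # Truncated BPTT: derivatives are propagated back num_bptt_steps from the
    # last label (default: the whole chunk), which the trainer is told via
    # --optimization.min-deriv-time.
    if num_bptt_steps is None:
        num_bptt_steps = chunk_width
    min_deriv_time = chunk_width - num_bptt_steps
    return (left_context, right_context,
            chunk_plus_left, chunk_plus_right, min_deriv_time)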
+ num_archives_to_process = args.num_epochs * num_archives + num_archives_processed = 0 + num_iters=(num_archives_to_process * 2) / (args.num_jobs_initial + args.num_jobs_final) + + num_iters_combine = VerifyIterations(num_iters, args.num_epochs, + num_hidden_layers, num_archives, + args.max_models_combine, args.add_layers_period, + args.num_jobs_final) + + learning_rate = lambda iter, current_num_jobs, num_archives_processed: GetLearningRate(iter, current_num_jobs, num_iters, + num_archives_processed, + num_archives_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + realign_iters = [] + if args.realign_times is not None: + realign_iters = GetRealignIters(args.realign_times, + num_iters, + args.num_jobs_initial, + args.num_jobs_final) + print(realign_iters) + # egs_dir will be updated if there is realignment + cur_egs_dir=egs_dir + + if args.num_bptt_steps is None: + num_bptt_steps = args.chunk_width + else: + num_bptt_steps = args.num_bptt_steps + + min_deriv_time = args.chunk_width - num_bptt_steps + + + logger.info("Training will run for {0} epochs = {1} iterations".format(args.num_epochs, num_iters)) + for iter in range(num_iters): + if (args.exit_stage is not None) and (iter == args.exit_stage): + logger.info("Exiting early due to --exit-stage {0}".format(iter)) + return + current_num_jobs = int(0.5 + args.num_jobs_initial + (args.num_jobs_final - args.num_jobs_initial) * float(iter) / num_iters) + + if args.stage <= iter: + if iter in realign_iters: + logger.info("Re-aligning the data at iteration {0}".format(iter)) + prev_egs_dir=cur_egs_dir + cur_egs_dir="{0}/egs_{1}".format(args.dir, "iter"+str(iter)) + new_ali_dir="{0}/ali_{1}".format(args.dir, "iter"+str(iter)) + Realign(args.dir, iter, args.feat_dir, args.lang, + prev_egs_dir, cur_egs_dir, + args.prior_subset_size, num_archives, run_opts, + transform_dir = args.transform_dir, online_ivector_dir = args.online_ivector_dir) + if args.cleanup and args.egs_dir is None: + RemoveEgs(prev_egs_dir) + model_file = "{dir}/{iter}.mdl".format(dir = args.dir, iter = iter) + shrinkage_value = args.shrink_value if DoShrinkage(iter, model_file, "SigmoidComponent", args.shrink_threshold) else 1 + logger.info("On iteration {0}, learning rate is {1} and shrink value is {2}.".format(iter, learning_rate(iter, current_num_jobs, num_archives_processed), shrinkage_value)) + + TrainOneIteration(args.dir, iter, egs_dir, current_num_jobs, + num_archives_processed, num_archives, + learning_rate(iter, current_num_jobs, num_archives_processed), + shrinkage_value, + args.num_chunk_per_minibatch, + num_hidden_layers, args.add_layers_period, + left_context, right_context, min_deriv_time, + args.momentum, args.max_param_change, + args.shuffle_buffer_size, run_opts) + if args.cleanup: + # do a clean up everythin but the last 2 models, under certain conditions + RemoveModel(args.dir, iter-2, num_iters, num_iters_combine, + args.preserve_model_interval) + + if args.email is not None: + reporting_iter_interval = num_iters * args.reporting_interval + if iter % reporting_iter_interval == 0: + # lets do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + message = report + subject = "Update : Expt {dir} : Iter {iter}".format(dir = args.dir, iter = iter) + sendMail(message, subject, args.email) + + num_archives_processed = num_archives_processed + current_num_jobs + + if args.stage <= num_iters: + logger.info("Doing final combination to produce final.mdl") + CombineModels(args.dir, num_iters, 
num_iters_combine, egs_dir, run_opts, + chunk_width = args.chunk_width) + + if args.stage <= num_iters + 1: + logger.info("Getting average posterior for purposes of adjusting the priors.") + avg_post_vec_file = ComputeAveragePosterior(args.dir, 'combined', egs_dir, + num_archives, args.prior_subset_size, run_opts) + + logger.info("Re-adjusting priors based on computed posteriors") + combined_model = "{dir}/combined.mdl".format(dir = args.dir) + final_model = "{dir}/final.mdl".format(dir = args.dir) + AdjustAmPriors(args.dir, combined_model, avg_post_vec_file, final_model, run_opts) + + if args.cleanup: + logger.info("Cleaning up the experiment directory {0}".format(args.dir)) + remove_egs = args.remove_egs + if args.egs_dir is not None: + # this egs_dir was not created by this experiment so we will not + # delete it + remove_egs = False + + CleanNnetDir(args.dir, num_iters, cur_egs_dir, + preserve_model_interval = args.preserve_model_interval, + remove_egs = remove_egs) + + # do some reporting + [report, times, data] = nnet3_log_parse.GenerateAccuracyReport(args.dir) + if args.email is not None: + sendMail(report, "Update : Expt {0} : complete".format(args.dir), args.email) + + report_handle = open("{dir}/accuracy.report".format(dir = args.dir), "w") + report_handle.write(report) + report_handle.close() + +def Main(): + [args, run_opts] = GetArgs() + try: + Train(args, run_opts) + except Exception as e: + if args.email is not None: + message = "Training session for experiment {dir} died due to an error.".format(dir = args.dir) + sendMail(message, message, args.email) + traceback.print_exc() + raise e + +def SendMail(message, subject, email_id): + try: + subprocess.Popen('echo "{message}" | mail -s "{subject}" {email} '.format( + message = message, + subject = subject, + email = email_id), shell=True) + except Exception as e: + logger.info(" Unable to send mail due to error:\n {error}".format(error = str(e))) + pass + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index 842ce7e9c94..99122fedd73 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -2,7 +2,7 @@ # note, TDNN is the same as what we used to call multisplice. -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -16,22 +16,23 @@ num_epochs=15 # Number of epochs of training; # the number of iterations is worked out from this. initial_effective_lrate=0.01 final_effective_lrate=0.001 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 relu_dim= # you can use this to make it use ReLU's instead of p-norms. rand_prune=4.0 # Relates to a speedup we do for LDA. minibatch_size=512 # This default is suitable for GPU-based training. # Set it to 128 for multi-threaded CPU-based training. - +max_param_change=2.0 # max param change per minibatch samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training num_jobs_final=8 # Number of neural net jobs to run in parallel at the end of training -prior_subset_size=20000 # 20k samples per job, for computing priors. +prior_subset_size=20000 # 20k samples per job, for computing priors. num_jobs_compute_prior=10 # these are single-threaded, run on CPU. 
get_egs_stage=0 # can be used for rerunning after partial online_ivector_dir= presoftmax_prior_scale_power=-0.25 +use_presoftmax_prior_scale=true remove_egs=true # set to false to disable removing egs after training is done. max_models_combine=20 # The "max_models_combine" is the maximum number of models we give @@ -57,25 +58,20 @@ splice_indexes="-4,-3,-2,-1,0,1,2,3,4 0 -2,2 0 -4,4 0" # note: hidden layers which are composed of one or more components, # so hidden layer indexing is different from component count - -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. -affine_opts= - use_gpu=true # if true, we run on GPU. -num_threads=16 # if using CPU, the number of threads we use. cleanup=true egs_dir= max_lda_jobs=10 # use no more than 10 jobs for the LDA accumulation. lda_opts= egs_opts= transform_dir= # If supplied, this dir used instead of alidir to find transforms. -cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. # only relevant for "raw" features, not lda. feat_type=raw # or set to 'lda' to use LDA features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -113,7 +109,6 @@ if [ $# != 4 ]; then echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -123,15 +118,15 @@ if [ $# != 4 ]; then echo " # Format : layer/....layer/ " echo " # (note: we splice processed, typically 40-dimensional frames" echo " --lda-dim # Dimension to reduce spliced features to with LDA" - echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" - echo " # realignment is to be done" + echo " --realign-times # A list of space-separated floating point numbers between 0.0 and" + echo " # 1.0 to specify how far through training realignment is to be done" echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." 
- + exit 1; fi @@ -193,13 +188,14 @@ if [ $stage -le -5 ]; then else dim_opts="--pnorm-input-dim $pnorm_input_dim --pnorm-output-dim $pnorm_output_dim" fi - + # create the config files for nnet initialization python steps/nnet3/make_tdnn_configs.py \ --splice-indexes "$splice_indexes" \ --feat-dim $feat_dim \ --ivector-dim $ivector_dim \ $dim_opts \ + --use-presoftmax-prior-scale $use_presoftmax_prior_scale \ --num-targets $num_leaves \ $dir/configs || exit 1; @@ -236,23 +232,22 @@ if [ $stage -le -4 ] && [ -z "$egs_dir" ]; then echo "$0: calling get_egs.sh" steps/nnet3/get_egs.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ - --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ --frames-per-eg $frames_per_eg \ $data $alidir $dir/egs || exit 1; fi -if [ "$feat_dim" != "$(cat $dir/egs/info/feat_dim)" ]; then - echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $dir/egs/info/feat_dim)"; +[ -z $egs_dir ] && egs_dir=$dir/egs + +if [ "$feat_dim" != "$(cat $egs_dir/info/feat_dim)" ]; then + echo "$0: feature dimension mismatch with egs, $feat_dim vs $(cat $egs_dir/info/feat_dim)"; exit 1; fi -if [ "$ivector_dim" != "$(cat $dir/egs/info/ivector_dim)" ]; then - echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $dir/egs/info/ivector_dim)"; +if [ "$ivector_dim" != "$(cat $egs_dir/info/ivector_dim)" ]; then + echo "$0: ivector dimension mismatch with egs, $ivector_dim vs $(cat $egs_dir/info/ivector_dim)"; exit 1; fi -[ -z $egs_dir ] && egs_dir=$dir/egs - # copy any of the following that exist, to $dir. cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null @@ -260,8 +255,8 @@ cp $egs_dir/{cmvn_opts,splice_opts,final.mat} $dir 2>/dev/null # the --egs-dir option was used on the command line). egs_left_context=$(cat $egs_dir/info/left_context) || exit -1 egs_right_context=$(cat $egs_dir/info/right_context) || exit -1 -( ! [ $(cat $egs_dir/info/left_context) -le $left_context ] || - ! [ $(cat $egs_dir/info/right_context) -le $right_context ] ) && \ + ( [ $egs_left_context -lt $left_context ] || \ + [ $egs_right_context -lt $right_context ] ) && \ echo "$0: egs in $egs_dir have too little context" && exit -1; frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } @@ -308,14 +303,14 @@ if [ $stage -le -2 ]; then echo "$0: preparing initial vector for FixedScaleComponent before softmax" echo " ... using priors^$presoftmax_prior_scale_power and rescaling to average 1" - # obtains raw pdf count + # obtains raw pdf count $cmd JOB=1:$nj $dir/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true $alidir/final.mdl ark:- $dir/pdf_counts.JOB || exit 1; $cmd $dir/log/sum_pdf_counts.log \ vector-sum --binary=false $dir/pdf_counts.* $dir/pdf_counts || exit 1; rm $dir/pdf_counts.* - + awk -v power=$presoftmax_prior_scale_power -v smooth=0.01 \ '{ for(i=2; i<=NF-1; i++) { count[i-2] = $i; total += $i; } num_pdfs=NF-2; average_count = total/num_pdfs; @@ -367,16 +362,11 @@ if $use_gpu; then exit 1 fi else - if [ $num_threads -gt 1 ]; then - parallel_suffix="-parallel" - parallel_train_opts="--num-threads=$num_threads" - train_queue_opt="--num-threads $num_threads" - combine_queue_opt="" # the combine stage will be quite slow if not using - # GPU, as we didn't enable that program to use - # multiple threads. - else - parallel_suffix="" - fi + echo "$0: without using a GPU this will be very slow. 
nnet3 does not yet support multiple threads." + parallel_train_opts="--use-gpu=no" + combine_queue_opt="" # the combine stage will be quite slow if not using + # GPU, as we didn't enable that program to use + # multiple threads. prior_gpu_opt="--use-gpu=no" prior_queue_opt="" fi @@ -420,7 +410,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -466,7 +456,7 @@ while [ $x -lt $num_iters ]; do steps/nnet3/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -476,13 +466,12 @@ while [ $x -lt $num_iters ]; do nnet3-compute-prob "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:- |" & - # nnet3-show-progress not implemented yet - #if [ $x -gt 0 ] && [ ! -f $dir/log/mix_up.$[$x-1].log ]; then - # $cmd $dir/log/progress.$x.log \ - # nnet3-show-progress --use-gpu=no $dir/$[$x-1].mdl $dir/$x.mdl \ - # ark:$cur_egs_dir/train_diagnostic.egs '&&' \ - # nnet3-info $dir/$x.mdl & - #fi + if [ $x -gt 0 ]; then + $cmd $dir/log/progress.$x.log \ + nnet3-show-progress --use-gpu=no "nnet3-am-copy --raw=true $dir/$[$x-1].mdl - |" "nnet3-am-copy --raw=true $dir/$x.mdl - |" \ + "ark:nnet3-merge-egs ark:$cur_egs_dir/train_diagnostic.egs ark:-|" '&&' \ + nnet3-info "nnet3-am-copy --raw=true $dir/$x.mdl - |" & + fi echo "Training neural net (pass $x)" @@ -516,7 +505,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -530,8 +519,9 @@ while [ $x -lt $num_iters ]; do # so we want to separate them in time. 
$cmd $train_queue_opt $dir/log/train.$x.$n.log \ - nnet3-train$parallel_suffix $parallel_train_opts "$raw" \ - "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size ark:- ark:- |" \ + nnet3-train $parallel_train_opts \ + --max-param-change=$max_param_change "$raw" \ + "ark:nnet3-copy-egs --frame=$frame $context_opts ark:$cur_egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$this_minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait @@ -555,7 +545,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; $cmd $dir/log/select.$x.log \ @@ -613,9 +603,11 @@ fi if [ $stage -le $[$num_iters+1] ]; then echo "Getting average posterior for purposes of adjusting the priors." # Note: this just uses CPUs, using a smallish subset of data. + if [ $num_jobs_compute_prior -gt $num_archives ]; then egs_part=1; + else egs_part=JOB; fi rm $dir/post.$x.*.vec 2>/dev/null $cmd JOB=1:$num_jobs_compute_prior $prior_queue_opt $dir/log/get_post.$x.JOB.log \ - nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.1.ark ark:- \| \ + nnet3-copy-egs --frame=random $context_opts --srand=JOB ark:$cur_egs_dir/egs.$egs_part.ark ark:- \| \ nnet3-subset-egs --srand=JOB --n=$prior_subset_size ark:- ark:- \| \ nnet3-merge-egs ark:- ark:- \| \ nnet3-compute-from-egs $prior_gpu_opt --apply-exp=true \ diff --git a/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh b/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh index 78b5b1bde2f..b4e70fc6af0 100755 --- a/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh +++ b/egs/wsj/s5/steps/online/nnet2/copy_data_dir.sh @@ -9,7 +9,6 @@ # versions, so that each speaker has no more than --utts-per-spk-max # utterances. - # begin configuration section utts_per_spk_max=-1 # end configuration section @@ -34,7 +33,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -81,5 +80,6 @@ echo "$0: copied data from $srcdir to $destdir, with --utts-per-spk-max $utts_pe opts= [ ! -f $srcdir/feats.scp ] && opts="--no-feats" [ ! -f $srcdir/text ] && opts="$opts --no-text" +[ ! 
-f $srcdir/wav.scp ] && opts="$opts --no-wav" utils/validate_data_dir.sh $opts $destdir diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh index 924a2f20eaf..f27baecd673 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -258,7 +258,7 @@ base_feat_dim=$(feat-to-dim scp:$data/feats.scp -) || exit 1; start_dim=$base_feat_dim end_dim=$[$base_feat_dim+$ivector_dim-1] - +absdir=$(readlink -f $dir) if [ $stage -le 4 ]; then # here, we are just using the original features in $sdata/JOB/feats.scp for @@ -269,7 +269,7 @@ if [ $stage -le 4 ]; then select-feats "$start_dim-$end_dim" ark:- ark:- \| \ subsample-feats --n=$ivector_period ark:- ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1; + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; fi if [ $stage -le 5 ]; then diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index 81d8a3219dc..d8ac11da720 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -14,7 +14,7 @@ # for online decoding. # Rather than treating each utterance separately, it carries forward -# information from one utterance to the next, within the speaker. +# information from one utterance to the next, within the speaker. # Begin configuration section. @@ -45,7 +45,6 @@ max_count=0 # The use of this option (e.g. --max-count 100) can make # End configuration section. echo "$0 $@" # Print the command line for logging - if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; @@ -56,7 +55,7 @@ if [ $# != 3 ]; then echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --nj # Number of jobs" echo " --stage # To control partial reruns" echo " --num-gselect # Number of Gaussians to select using" echo " # diagonal model." 
@@ -94,6 +93,7 @@ echo -n >$ieconf cp $srcdir/online_cmvn.conf $dir/conf/ || exit 1; echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf for x in $(echo $splice_opts); do echo "$x"; done > $dir/conf/splice.conf +echo "--ivector-period=$ivector_period" >>$ieconf echo "--splice-config=$dir/conf/splice.conf" >>$ieconf echo "--lda-matrix=$srcdir/final.mat" >>$ieconf echo "--global-cmvn-stats=$srcdir/global_cmvn.stats" >>$ieconf @@ -106,6 +106,7 @@ echo "--max-remembered-frames=1000" >>$ieconf # the default echo "--max-count=$max_count" >>$ieconf +absdir=$(readlink -f $dir) for n in $(seq $nj); do # This will do nothing unless the directory $dir/storage exists; @@ -118,7 +119,7 @@ if [ $stage -le 0 ]; then $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$dir/ivector_online.JOB.ark,$dir/ivector_online.JOB.scp || exit 1; + ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; fi if [ $stage -le 1 ]; then diff --git a/egs/wsj/s5/steps/online/nnet3/decode.sh b/egs/wsj/s5/steps/online/nnet3/decode.sh new file mode 100755 index 00000000000..af8a33f3ac3 --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet3/decode.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (Author: Daniel Povey) +# 2016 Api.ai (Author: Ilya Platonov) +# Apache 2.0 + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +max_active=7000 +beam=15.0 +lattice_beam=6.0 +acwt=0.1 # note: only really affects adaptation and pruning (scoring is on + # lattices). +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +per_utt=false +online=true # only relevant to non-threaded decoder. +do_endpointing=false +do_speex_compressing=false +scoring_opts= +skip_scoring=false +silence_weight=1.0 # set this to a value less than 1 (e.g. 0) to enable silence weighting. +max_state_duration=40 # This only has an effect if you are doing silence + # weighting. This default is probably reasonable. transition-ids repeated + # more than this many times in an alignment are treated as silence. +iter=final +online_config= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "... where is assumed to be a sub-directory of the directory" + echo " where the models are, as prepared by steps/online/nnet3/prepare_online_decoding.sh" + echo "e.g.: $0 exp/chain/tdnn/graph data/test exp/chain/tdnn_online/decode/" + echo "" + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --online-config # online decoder options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --acwt # acoustic scale used for lattice generation " + echo " --per-utt # If true, decode per utterance without" + echo " # carrying forward adaptation info from previous" + echo " # utterances of each speaker. Default: false" + echo " --online # Set this to false if you don't really care about" + echo " # simulating online decoding and just want the best" + echo " # results. 
This will use all the data within each" + echo " # utterance (plus any previous utterance, if not in" + echo " # per-utterance mode) to estimate the iVectors." + echo " --scoring-opts # options to local/score.sh" + echo " --iter # Iteration of model to decode; default is final." + exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 +srcdir=`dirname $dir`; # The model directory is one level up from decoding directory. +sdata=$data/split$nj; + +if [ "$online_config" == "" ]; then + online_config=$srcdir/conf/online.conf; +fi + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +for f in $online_config $srcdir/${iter}.mdl \ + $graphdir/HCLG.fst $graphdir/words.txt $data/wav.scp; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1; + fi +done + +if ! $per_utt; then + spk2utt_rspecifier="ark:$sdata/JOB/spk2utt" +else + mkdir -p $dir/per_utt + for j in $(seq $nj); do + awk '{print $1, $1}' <$sdata/$j/utt2spk >$dir/per_utt/utt2spk.$j || exit 1; + done + spk2utt_rspecifier="ark:$dir/per_utt/utt2spk.JOB" +fi + +if [ -f $data/segments ]; then + wav_rspecifier="ark,s,cs:extract-segments scp,p:$sdata/JOB/wav.scp $sdata/JOB/segments ark:- |" +else + wav_rspecifier="ark,s,cs:wav-copy scp,p:$sdata/JOB/wav.scp ark:- |" +fi +if $do_speex_compressing; then + wav_rspecifier="$wav_rspecifier compress-uncompress-speex ark:- ark:- |" +fi +if $do_endpointing; then + wav_rspecifier="$wav_rspecifier extend-wav-with-silence ark:- ark:- |" +fi + +if [ "$silence_weight" != "1.0" ]; then + silphones=$(cat $graphdir/phones/silence.csl) || exit 1 + silence_weighting_opts="--ivector-silence-weighting.max-state-duration=$max_state_duration --ivector-silence-weighting.silence_phones=$silphones --ivector-silence-weighting.silence-weight=$silence_weight" +else + silence_weighting_opts= +fi + + +decoder=online2-wav-nnet3-latgen-faster +parallel_opts= +opts="--online=$online" + + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 0 ]; then + $cmd $parallel_opts JOB=1:$nj $dir/log/decode.JOB.log \ + $decoder $opts $silence_weighting_opts --do-endpointing=$do_endpointing $frame_subsampling_opt \ + --config=$online_config \ + --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ + --acoustic-scale=$acwt --word-symbol-table=$graphdir/words.txt \ + $srcdir/${iter}.mdl $graphdir/HCLG.fst $spk2utt_rspecifier "$wav_rspecifier" \ + "$lat_wspecifier" || exit 1; +fi + +if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir +fi + +exit 0; diff --git a/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh b/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh new file mode 100755 index 00000000000..c7d7156068f --- /dev/null +++ b/egs/wsj/s5/steps/online/nnet3/prepare_online_decoding.sh @@ -0,0 +1,168 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Begin configuration. +stage=0 # This allows restarting after partway, when something when wrong. 
+feature_type=mfcc +add_pitch=false +mfcc_config=conf/mfcc.conf # you can override any of these you need to override. +plp_config=conf/plp.conf +fbank_config=conf/fbank.conf +# online_pitch_config is the config file for both pitch extraction and +# post-processing; we combine them into one because during training this +# is given to the program compute-and-process-kaldi-pitch-feats. +online_pitch_config=conf/online_pitch.conf + +# Below are some options that affect the iVectors, and should probably +# match those used in extract_ivectors_online.sh. +num_gselect=5 # Gaussian-selection using diagonal model: number of Gaussians to select +posterior_scale=0.1 # Scale on the acoustic posteriors, intended to account for + # inter-frame correlations. +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) + # caution: you should use the same value in the online-estimation + # code. +max_count=100 # This max-count of 100 can make iVectors more consistent for + # different lengths of utterance, by scaling up the prior term + # when the data-count exceeds this value. The data-count is + # after posterior-scaling, so assuming the posterior-scale is + # 0.1, --max-count 100 starts having effect after 1000 frames, + # or 10 seconds of data. +iter=final +# End configuration. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh; +. parse_options.sh || exit 1; + +if [ $# -ne 4 ] && [ $# -ne 3 ]; then + echo "Usage: $0 [options] [] " + echo "e.g.: $0 data/lang exp/nnet2_online/extractor exp/nnet2_online/nnet exp/nnet2_online/nnet_online" + echo "main options (for others, see top of script file)" + echo " --feature-type # Type of the base features; " + echo " # important to generate the correct" + echo " # configs in /conf/" + echo " --add-pitch # Append pitch features to cmvn" + echo " # (default: false)" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --config # config containing options" + echo " --iter # iteration of model to take." + echo " --stage # stage to do partial re-run from." + exit 1; +fi + + +if [ $# -eq 4 ]; then + lang=$1 + iedir=$2 + srcdir=$3 + dir=$4 +else + [ $# -eq 3 ] || exit 1; + lang=$1 + iedir= + srcdir=$2 + dir=$3 +fi + +for f in $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done +if [ ! -z "$iedir" ]; then + for f in final.{mat,ie,dubm} splice_opts global_cmvn.stats online_cmvn.conf; do + [ ! -f $iedir/$f ] && echo "$0: no such file $iedir/$f" && exit 1; + done +fi + + +dir=$(readlink -f $dir) # Convert $dir to an absolute pathname, so that the + # configuration files we write will contain absolute + # pathnames. +mkdir -p $dir/conf + + +cp $srcdir/${iter}.mdl $dir/final.mdl || exit 1; +cp $srcdir/tree $dir/ || exit 1; +if [ -f $srcdir/frame_subsampling_factor ]; then + cp $srcdir/frame_subsampling_factor $dir/ +fi + +if [ ! -z "$iedir" ]; then + mkdir -p $dir/ivector_extractor/ + cp $iedir/final.{mat,ie,dubm} $iedir/global_cmvn.stats $dir/ivector_extractor/ || exit 1; + + # The following things won't be needed directly by the online decoding, but + # will allow us to run prepare_online_decoding.sh again with + # $dir/ivector_extractor/ as the input directory (useful in certain + # cross-system training scenarios). 
+ cp $iedir/splice_opts $iedir/online_cmvn.conf $dir/ivector_extractor/ || exit 1; +fi + + +mkdir -p $dir/conf +rm $dir/{plp,mfcc,fbank}.conf 2>/dev/null +echo "$0: preparing configuration files in $dir/conf" + +if [ -f $dir/conf/online.conf ]; then + echo "$0: moving $dir/conf/online.conf to $dir/conf/online.conf.bak" + mv $dir/conf/online.conf $dir/conf/online.conf.bak +fi + +conf=$dir/conf/online.conf +echo -n >$conf + +echo "--feature-type=$feature_type" >>$conf + +case "$feature_type" in + mfcc) + echo "--mfcc-config=$dir/conf/mfcc.conf" >>$conf + cp $mfcc_config $dir/conf/mfcc.conf || exit 1;; + plp) + echo "--plp-config=$dir/conf/plp.conf" >>$conf + cp $plp_config $dir/conf/plp.conf || exit 1;; + fbank) + echo "--fbank-config=$dir/conf/fbank.conf" >>$conf + cp $fbank_config $dir/conf/fbank.conf || exit 1;; + *) + echo "Unknown feature type $feature_type" +esac + + + +if [ ! -z "$iedir" ]; then + ieconf=$dir/conf/ivector_extractor.conf + echo -n >$ieconf + echo "--ivector-extraction-config=$ieconf" >>$conf + cp $iedir/online_cmvn.conf $dir/conf/online_cmvn.conf || exit 1; + # the next line puts each option from splice_opts on its own line in the config. + for x in $(cat $iedir/splice_opts); do echo "$x"; done > $dir/conf/splice.conf + echo "--splice-config=$dir/conf/splice.conf" >>$ieconf + echo "--cmvn-config=$dir/conf/online_cmvn.conf" >>$ieconf + echo "--lda-matrix=$dir/ivector_extractor/final.mat" >>$ieconf + echo "--global-cmvn-stats=$dir/ivector_extractor/global_cmvn.stats" >>$ieconf + echo "--diag-ubm=$dir/ivector_extractor/final.dubm" >>$ieconf + echo "--ivector-extractor=$dir/ivector_extractor/final.ie" >>$ieconf + echo "--num-gselect=$num_gselect" >>$ieconf + echo "--min-post=$min_post" >>$ieconf + echo "--posterior-scale=$posterior_scale" >>$ieconf # this is currently the default in the scripts. + echo "--max-remembered-frames=1000" >>$ieconf # the default + echo "--max-count=$max_count" >>$ieconf +fi + +if $add_pitch; then + echo "$0: enabling pitch features" + echo "--add-pitch=true" >>$conf + echo "$0: creating $dir/conf/online_pitch.conf" + if [ ! -f $online_pitch_config ]; then + echo "$0: expected file '$online_pitch_config' to exist."; + exit 1; + fi + cp $online_pitch_config $dir/conf/online_pitch.conf || exit 1; + echo "--online-pitch-config=$dir/conf/online_pitch.conf" >>$conf +fi + +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +echo "--endpoint.silence-phones=$silphonelist" >>$conf +echo "$0: created config file $conf" diff --git a/egs/wsj/s5/steps/paste_feats.sh b/egs/wsj/s5/steps/paste_feats.sh index da82179f616..abeee5aba23 100755 --- a/egs/wsj/s5/steps/paste_feats.sh +++ b/egs/wsj/s5/steps/paste_feats.sh @@ -44,10 +44,10 @@ done mkdir -p $ark_dir $logdir -mkdir -p $data +mkdir -p $data cp $data_src_first/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` @@ -58,19 +58,25 @@ for data_src in ${data_src_arr[@]}; do data_src_args="$data_src_args scp:$data_src/split$nj/JOB/feats.scp" done +for n in $(seq $nj); do + # the next command does nothing unless $arkdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
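+  # For example, if the storage/ subdirectory was set up beforehand (e.g. with
+  # utils/create_split_dir.pl pointing at several filesystems), each archive is
+  # pre-created as a symlink such as storage/3/pasted_<name>.<n>.ark, so the
+  # pasted features get spread across those disks; the exact index is chosen
+  # by create_data_link.pl.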
+ utils/create_data_link.pl $arkdir/pasted_$name.$n.ark +done + $cmd JOB=1:$nj $logdir/append.JOB.log \ paste-feats --length-tolerance=$length_tolerance $data_src_args ark:- \| \ copy-feats --compress=$compress ark:- \ ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + # concatenate the .scp files together. for ((n=1; n<=nj; n++)); do cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" echo "consider using utils/fix_data_dir.sh $data" diff --git a/egs/wsj/s5/steps/rnnlmrescore.sh b/egs/wsj/s5/steps/rnnlmrescore.sh index c1302e2beed..2cb6700432a 100755 --- a/egs/wsj/s5/steps/rnnlmrescore.sh +++ b/egs/wsj/s5/steps/rnnlmrescore.sh @@ -1,5 +1,6 @@ #!/bin/bash +# please see lmrescore_rnnlm_lat.sh which is a newer script using lattices. # Begin configuration section. N=10 @@ -104,12 +105,14 @@ if [ "$oldlm" == "$oldlang/G.fst" ]; then if [ $stage -le 2 ]; then echo "$0: removing old LM scores." # Use the phi-matcher style of composition.. this is appropriate - # if the old LM scores were added e.g. by lmrescore.sh, using + # if the old LM scores were added e.g. by lmrescore.sh, using # phi-matcher composition. $cmd JOB=1:$nj $dir/log/remove_old.JOB.log \ - lattice-compose --phi-label=$phi "ark:gunzip -c $dir/nbest1.JOB.gz|" $oldlm \ - "ark:|gzip -c >$dir/nbest2.JOB.gz" || exit 1; - fi + lattice-scale --acoustic-scale=-1 --lm-scale=-1 "ark:gunzip -c $dir/nbest1.JOB.gz|" ark:- \| \ + lattice-compose --phi-label=$phi ark:- $oldlm ark:- \| \ + lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- "ark:|gzip -c >$dir/nbest2.JOB.gz" \ + || exit 1; + fi else if [ $stage -le 2 ]; then echo "$0: removing old LM scores." @@ -187,7 +190,7 @@ if [ $stage -le 7 ]; then echo "$0: reconstructing total LM+graph scores including interpolation of RNNLM and old LM scores." for n in `seq $nj`; do paste $adir.$n/lmwt.nolm $adir.$n/lmwt.lmonly $adir.$n/lmwt.rnn | awk -v rnnweight=$rnnweight \ - '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6; + '{ key=$1; graphscore=$2; lmscore=$4; rnnscore=$6; score = graphscore+(rnnweight*rnnscore)+((1-rnnweight)*lmscore); print $1,score; } ' > $adir.$n/lmwt.interp.$rnnweight || exit 1; done diff --git a/egs/wsj/s5/steps/score_kaldi.sh b/egs/wsj/s5/steps/score_kaldi.sh index 5ed223d0312..36fc0e429bc 100755 --- a/egs/wsj/s5/steps/score_kaldi.sh +++ b/egs/wsj/s5/steps/score_kaldi.sh @@ -14,6 +14,7 @@ beam=6 word_ins_penalty=0.0,0.5,1.0 min_lmwt=9 max_lmwt=20 +iter=final #end configuration section. echo "$0 $@" # Print the command line for logging @@ -137,12 +138,18 @@ if [ $stage -le 1 ]; then cat $dir/scoring_kaldi/wer_details/per_utt \| \ utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi fi # If we got here, the scoring was successful. 
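+# (the compute-wer-bootci call above attaches a bootstrap confidence interval
+# to the WER of the selected LM-weight / insertion-penalty combination by
+# resampling over utterances; the result is written to
+# scoring_kaldi/wer_details/wer_bootci.)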
# As a small aid to prevent confusion, we remove all wer_{?,??} files; -# these originate from the previous version of the scoring files +# these originate from the previous version of the scoring files rm $dir/wer_{?,??} 2>/dev/null exit 0; diff --git a/egs/wsj/s5/steps/score_kaldi_compare.sh b/egs/wsj/s5/steps/score_kaldi_compare.sh new file mode 100755 index 00000000000..91fc057b906 --- /dev/null +++ b/egs/wsj/s5/steps/score_kaldi_compare.sh @@ -0,0 +1,50 @@ +#!/bin/bash +# Copyright 2016 Nicolas Serrano +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +replications=10000 +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_compare.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --replications # number of bootstrap evaluation to compute confidence." + exit 1; +fi + +dir1=$1 +dir2=$2 +dir_compare=$3 + +mkdir -p $dir_compare/log + +for d in $dir1 $dir2; do + for f in test_filt.txt best_wer; do + [ ! -f $d/$f ] && echo "score_compare.sh: no such file $d/$f" && exit 1; + done +done + + +best_wer_file1=$(awk '{print $NF}' $dir1/best_wer) +best_transcript_file1=$(echo $best_wer_file1 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir1 '{print dir"/penalty_"$2"/"$1".txt"}') + +best_wer_file2=$(awk '{print $NF}' $dir2/best_wer) +best_transcript_file2=$(echo $best_wer_file2 | sed -e 's=.*/wer_==' | \ + awk -v FS='_' -v dir=$dir2 '{print dir"/penalty_"$2"/"$1".txt"}') + +$cmd $dir_compare/log/score_compare.log \ + compute-wer-bootci --replications=$replications \ + ark:$dir1/test_filt.txt ark:$best_transcript_file1 ark:$best_transcript_file2 \ + '>' $dir_compare/wer_bootci_comparison || exit 1; + +exit 0; diff --git a/egs/wsj/s5/steps/select_feats.sh b/egs/wsj/s5/steps/select_feats.sh index 970823fdf25..072dd3194cf 100755 --- a/egs/wsj/s5/steps/select_feats.sh +++ b/egs/wsj/s5/steps/select_feats.sh @@ -43,31 +43,31 @@ mkdir -p $ark_dir $logdir mkdir -p $data cp $data_in/* $data/ 2>/dev/null # so we get the other files, such as utt2spk. -rm $data/cmvn.scp 2>/dev/null -rm $data/feats.scp 2>/dev/null +rm $data/cmvn.scp 2>/dev/null +rm $data/feats.scp 2>/dev/null # use "name" as part of name of the archive. name=`basename $data` -for j in $(seq $nj); do +for j in $(seq $nj); do # the next command does nothing unless $mfccdir/storage/ exists, see # utils/create_data_link.pl for more info. - utils/create_data_link.pl $ark_dir/pasted_$name.$j.ark + utils/create_data_link.pl $ark_dir/selected_$name.$j.ark done $cmd JOB=1:$nj $logdir/append.JOB.log \ select-feats "$selector" scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ copy-feats --compress=$compress ark:- \ - ark,scp:$ark_dir/pasted_$name.JOB.ark,$ark_dir/pasted_$name.JOB.scp || exit 1; - + ark,scp:$ark_dir/selected_$name.JOB.ark,$ark_dir/selected_$name.JOB.scp || exit 1; + # concatenate the .scp files together. 
for ((n=1; n<=nj; n++)); do - cat $ark_dir/pasted_$name.$n.scp >> $data/feats.scp || exit 1; + cat $ark_dir/selected_$name.$n.scp >> $data/feats.scp || exit 1; done > $data/feats.scp || exit 1; -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` if [ $nf -ne $nu ]; then echo "It seems not all of the feature files were successfully processed ($nf != $nu);" exit 1; diff --git a/egs/wsj/s5/steps/shift_feats.sh b/egs/wsj/s5/steps/shift_feats.sh new file mode 100755 index 00000000000..9ad85368c3f --- /dev/null +++ b/egs/wsj/s5/steps/shift_feats.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0 + +# This script shifts the feats in the input data directory and creates a +# new directory _fs with shifted feats. +# If the shift is negative, the initial frames get truncated. +# If the shift is positive, the first frame is repeated. +# Usually applicable for sequence training + +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Begin configuration section. +cmd=run.pl +nj=4 +compress=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "usage: $0 [options] "; + echo "e.g.: $0 -1 data/train exp/shift-1_train mfcc" + echo "options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +num_frames_shift=$1 +data_in=$2 +logdir=$3 +featdir=$4 + +utt_prefix="fs$num_frames_shift-" +spk_prefix="fs$num_frames_shift-" + +# make $featdir an absolute pathname. +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +utils/split_data.sh $data_in $nj || exit 1; + +data=${data_in}_fs$num_frames_shift + +mkdir -p $featdir $logdir +mkdir -p $data + +utils/copy_data_dir.sh --utt-prefix $utt_prefix --spk-prefix $spk_prefix \ + $data_in $data + +rm $data/feats.scp 2>/dev/null + +# use "name" as part of name of the archive. +name=`basename $data` + +for j in $(seq $nj); do + # the next command does nothing unless $mfccdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/raw_feats_$name.$j.ark +done + +$cmd JOB=1:$nj $logdir/shift.JOB.log \ + shift-feats --shift=$num_frames_shift \ + scp:$data_in/split$nj/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress ark:- \ + ark,scp:$featdir/raw_feats_$name.JOB.ark,$featdir/raw_feats_$name.JOB.scp || exit 1; + +# concatenate the .scp files together. 
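+# The "fs$num_frames_shift-" prefix added by the awk command below must match
+# the --utt-prefix/--spk-prefix given to copy_data_dir.sh above, so that the
+# keys in feats.scp agree with the renamed utterances in utt2spk (e.g. utt1
+# becomes fs-1-utt1 for a shift of -1).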
+for ((n=1; n<=nj; n++)); do + cat $featdir/raw_feats_$name.$n.scp +done | awk -v nfs=$num_frames_shift '{print "fs"nfs"-"$0}'>$data/feats.scp || exit 1; + +nf=`cat $data/feats.scp | wc -l` +nu=`cat $data/utt2spk | wc -l` +if [ $nf -ne $nu ]; then + echo "It seems not all of the feature files were successfully processed ($nf != $nu);" + exit 1; +fi + +echo "Succeeded shifting features for $name into $data" + diff --git a/egs/wsj/s5/steps/train_diag_ubm.sh b/egs/wsj/s5/steps/train_diag_ubm.sh index 5ec4696c75c..5cac8c462da 100755 --- a/egs/wsj/s5/steps/train_diag_ubm.sh +++ b/egs/wsj/s5/steps/train_diag_ubm.sh @@ -53,6 +53,7 @@ silphonelist=`cat $lang/phones/silence.csl` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -61,7 +62,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_lda_mllt.sh b/egs/wsj/s5/steps/train_lda_mllt.sh index c8522985a6d..f8f05c87f92 100755 --- a/egs/wsj/s5/steps/train_lda_mllt.sh +++ b/egs/wsj/s5/steps/train_lda_mllt.sh @@ -1,6 +1,13 @@ #!/bin/bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# +# LDA+MLLT refers to the way we transform the features after computing +# the MFCCs: we splice across several frames, reduce the dimension (to 40 +# by default) using Linear Discriminant Analysis), and then later estimate, +# over multiple iterations, a diagonalizing transform known as MLLT or CTC. +# See http://kaldi.sourceforge.net/transform.html for more explanation. +# # Apache 2.0. # Begin configuration. @@ -85,7 +92,7 @@ feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |" if [ $stage -le -5 ]; then if [ -z "$use_lda_mat" ]; then echo "Accumulating LDA statistics." 
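+    # (the 2>/dev/null on the rm below keeps the script quiet when there are
+    # no stale lda.*.acc files to remove, e.g. on a fresh run.)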
- rm $dir/lda.*.acc + rm $dir/lda.*.acc 2>/dev/null $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \ ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \ @@ -204,7 +211,7 @@ while [ $x -lt $num_iters ]; do $cmd $dir/log/update.$x.log \ gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \ $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; - rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs + rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs fi [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; x=$[$x+1]; diff --git a/egs/wsj/s5/steps/train_map.sh b/egs/wsj/s5/steps/train_map.sh index a0b4e54bc3f..2bdf4d6cd77 100755 --- a/egs/wsj/s5/steps/train_map.sh +++ b/egs/wsj/s5/steps/train_map.sh @@ -45,6 +45,7 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log @@ -57,6 +58,7 @@ utils/ln.pl $alidir/ali.*.gz $dir echo $nj >$dir/num_jobs cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; ## Set up features. @@ -64,7 +66,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null diff --git a/egs/wsj/s5/steps/train_mmi.sh b/egs/wsj/s5/steps/train_mmi.sh index dcee408c0d7..7ee0a135d00 100755 --- a/egs/wsj/s5/steps/train_mmi.sh +++ b/egs/wsj/s5/steps/train_mmi.sh @@ -57,9 +57,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. 
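+# delta_opts, if present in the alignment directory, records the options that
+# were passed to add-deltas (e.g. --delta-order) when the system was trained;
+# reading and copying it keeps the MMI features consistent with the alignments.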
+cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -74,7 +76,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mmi_fmmi.sh b/egs/wsj/s5/steps/train_mmi_fmmi.sh index 36130c3456b..4fd25ab13f3 100755 --- a/egs/wsj/s5/steps/train_mmi_fmmi.sh +++ b/egs/wsj/s5/steps/train_mmi_fmmi.sh @@ -76,9 +76,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -87,7 +89,7 @@ echo "$0: feature type is $feat_type" # Note: $feats is the features before fMPE. case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh b/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh index 24670103917..42bb660cbf6 100755 --- a/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh +++ b/egs/wsj/s5/steps/train_mmi_fmmi_indirect.sh @@ -74,9 +74,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -85,7 +87,7 @@ echo "$0: feature type is $feat_type" # Note: $feats is the features before fMPE. 
case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_mono.sh b/egs/wsj/s5/steps/train_mono.sh index c03fbf4b118..9efeb9a084d 100755 --- a/egs/wsj/s5/steps/train_mono.sh +++ b/egs/wsj/s5/steps/train_mono.sh @@ -13,7 +13,7 @@ cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" num_iters=40 # Number of iterations of training max_iter_inc=30 # Last iter to increase #Gauss on. -totgauss=1000 # Target #Gaussians. +totgauss=1000 # Target #Gaussians. careful=false boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38"; @@ -65,7 +65,7 @@ shared_phones_opt="--shared-phones=$lang/phones/sets.int" if [ $stage -le -3 ]; then # Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway. if ! feat_dim=`feat-to-dim "$example_feats" - 2>/dev/null` || [ -z $feat_dim ]; then - feat-to-dim "$example_feats" + feat-to-dim "$example_feats" - echo "error getting feature dimension" exit 1; fi diff --git a/egs/wsj/s5/steps/train_quick.sh b/egs/wsj/s5/steps/train_quick.sh index 38d67cdd182..b6e99334b74 100755 --- a/egs/wsj/s5/steps/train_quick.sh +++ b/egs/wsj/s5/steps/train_quick.sh @@ -59,11 +59,13 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log echo $nj >$dir/num_jobs cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; ## Set up features. 
@@ -71,7 +73,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null diff --git a/egs/wsj/s5/steps/train_sat.sh b/egs/wsj/s5/steps/train_sat.sh index 4fb35b2a722..51fddd3fe4b 100755 --- a/egs/wsj/s5/steps/train_sat.sh +++ b/egs/wsj/s5/steps/train_sat.sh @@ -32,6 +32,9 @@ power=0.2 # Exponent for number of gaussians according to occurrence counts cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves phone_map= train_tree=true +tree_stats_opts= +cluster_phones_opts= +compile_questions_opts= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -90,7 +93,7 @@ echo "$0: feature type is $feat_type" case $feat_type in delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" - cp $alidir/final.mat $dir + cp $alidir/final.mat $dir cp $alidir/full.mat $dir 2>/dev/null ;; *) echo "$0: invalid feature type $feat_type" && exit 1; @@ -101,7 +104,7 @@ if [ -f $alidir/trans.1 ]; then echo "$0: Using transforms from $alidir" feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir -else +else if [ $stage -le -5 ]; then echo "$0: obtaining initial fMLLR transforms since not present in $alidir" # The next line is necessary because of $silphonelist otherwise being incorrect; would require @@ -123,7 +126,7 @@ if [ $stage -le -4 ] && $train_tree; then # Get tree stats. echo "$0: Accumulating tree stats" $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ - acc-tree-stats $context_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ + acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \ "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1; [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1; $cmd $dir/log/sum_tree_acc.log \ @@ -134,9 +137,9 @@ fi if [ $stage -le -3 ] && $train_tree; then echo "$0: Getting questions for tree clustering." # preparing questions, roots file... 
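+  # The tree_stats_opts, cluster_phones_opts and compile_questions_opts
+  # variables introduced above are passed straight through to acc-tree-stats,
+  # cluster-phones and compile-questions, so callers with non-standard
+  # topologies can override the defaults, e.g. (illustrative)
+  #   --cluster-phones-opts "--pdf-class-list=0"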
- cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1; + cluster-phones $cluster_phones_opts $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1; cat $lang/phones/extra_questions.int >> $dir/questions.int - compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; + compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1; echo "$0: Building the tree" $cmd $dir/log/build_tree.log \ @@ -212,7 +215,7 @@ while [ $x -lt $num_iters ]; do feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" cur_trans_dir=$dir fi - + if [ $stage -le $x ]; then $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \ gmm-acc-stats-ali $dir/$x.mdl "$feats" \ @@ -222,7 +225,7 @@ while [ $x -lt $num_iters ]; do gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \ "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1; rm $dir/$x.mdl $dir/$x.*.acc - rm $dir/$x.occs + rm $dir/$x.occs fi [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss]; x=$[$x+1]; @@ -257,7 +260,7 @@ utils/summarize_warnings.pl $dir/log echo "$0: Likelihood evolution:" for x in `seq $[$num_iters-1]`; do tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); } - /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} + /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} END{ d /= t2; l /= t; printf("%s ", d+l); } ' done echo diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index a709096760a..cbe14249646 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -63,10 +63,12 @@ ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; sdata=$data/split$nj; splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options. cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null echo $nj >$dir/num_jobs [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -78,7 +80,7 @@ echo "$0: feature type is $feat_type" ## Set up speaker-independent features. 
case $feat_type in - delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_smbr.sh b/egs/wsj/s5/steps/train_smbr.sh index 1d38dc4532a..c8f9e8f7139 100755 --- a/egs/wsj/s5/steps/train_smbr.sh +++ b/egs/wsj/s5/steps/train_smbr.sh @@ -56,9 +56,11 @@ nj=`cat $alidir/num_jobs` || exit 1; sdata=$data/split$nj splice_opts=`cat $alidir/splice_opts 2>/dev/null` cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` mkdir -p $dir/log cp $alidir/splice_opts $dir 2>/dev/null cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +cp $alidir/delta_opts $dir 2>/dev/null [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; echo $nj > $dir/num_jobs @@ -72,7 +74,7 @@ if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/steps/train_ubm.sh b/egs/wsj/s5/steps/train_ubm.sh index bc8b19cd3b6..3b483872497 100755 --- a/egs/wsj/s5/steps/train_ubm.sh +++ b/egs/wsj/s5/steps/train_ubm.sh @@ -63,13 +63,14 @@ sdata=$data/split$nj; [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` +delta_opts=`cat $alidir/delta_opts 2>/dev/null` ## Set up features. 
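+# (the feature type is inferred from the alignment directory: a final.mat there
+# means spliced + LDA/MLLT features, otherwise delta features; CMVN is applied
+# in both cases.)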
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi echo "$0: feature type is $feat_type" case $feat_type in - delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";; + delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |" cp $alidir/final.mat $dir ;; diff --git a/egs/wsj/s5/utils/build_const_arpa_lm.sh b/egs/wsj/s5/utils/build_const_arpa_lm.sh index 41760159b75..375ffd79eb4 100755 --- a/egs/wsj/s5/utils/build_const_arpa_lm.sh +++ b/egs/wsj/s5/utils/build_const_arpa_lm.sh @@ -33,7 +33,6 @@ mkdir -p $new_lang mkdir -p $new_lang cp -r $old_lang/* $new_lang - unk=`cat $new_lang/oov.int` bos=`grep "" $new_lang/words.txt | awk '{print $2}'` eos=`grep "" $new_lang/words.txt | awk '{print $2}'` diff --git a/egs/wsj/s5/utils/combine_ali_dirs.sh b/egs/wsj/s5/utils/combine_ali_dirs.sh new file mode 100755 index 00000000000..ae05326a3ee --- /dev/null +++ b/egs/wsj/s5/utils/combine_ali_dirs.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2016 Xiaohui Zhang Apache 2.0. + +# This srcipt operates on alignment directories, such as exp/tri4a_ali + +# Begin configuration section. +cmd=run.pl +extra_files= +num_jobs=4 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 [options] ..." + echo "e.g.: $0 --num-jobs 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2" + echo "Options:" + echo " --extra-files # specify addtional files in 'src-ali-dir1' to copy" + echo " --num-jobs # number of jobs used to split the data directory." + echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." + echo " Other than alignments, only files from the first src ali dir are copied." + exit 1; +fi + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm $dest/{ali.*.gz,num_jobs} 2>/dev/null + +export LC_ALL=C + +for dir in $*; do + if [ ! -f $dir/ali.1.gz ]; then + echo "$0: check if alignments (ali.*.gz) are present in $dir." + exit 1; + fi +done + +for dir in $*; do + for f in tree; do + diff $first_src/$f $dir/$f 1>/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "$0: Cannot combine alignment directories with different $f files." + fi + done +done + +for f in final.mdl tree cmvn_opts num_jobs $extra_files; do + if [ ! -f $first_src/$f ]; then + echo "combine_ali_dir.sh: no such file $first_src/$f" + exit 1; + fi + cp $first_src/$f $dest/ +done + +src_id=0 +temp_dir=$dest/temp +mkdir -p $temp_dir +echo "$0: dumping alignments in each source directory as single archive and index." +for dir in $*; do + src_id=$((src_id + 1)) + cur_num_jobs=$(cat $dir/num_jobs) || exit 1; + all_ids=$(seq -s, $cur_num_jobs) + $cmd $dir/log/copy_alignments.log \ + copy-int-vector "ark:gunzip -c $dir/ali.{$all_ids}.gz|" \ + ark,scp:$temp_dir/ali.$src_id.ark,$temp_dir/ali.$src_id.scp || exit 1; +done +cat $temp_dir/ali.*.scp | sort -m > $temp_dir/ali.scp || exit 1; + +echo "$0: splitting data to get reference utt2spk for individual ali.JOB.gz files." 
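+# The combined alignments are re-split according to the utt2spk of the target
+# data directory, so the resulting ali.JOB.gz files line up with
+# $data/split$num_jobs/JOB no matter how the source directories were split.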
+utils/split_data.sh $data $num_jobs || exit 1; +echo $num_jobs > $dest/num_jobs || exit 1 + +echo "$0: splitting the alignments to appropriate chunks according to the reference utt2spk files." +for i in `seq 1 $num_jobs`; do + awk '{print $1}' $data/split$num_jobs/$i/utt2spk | sort > $temp_dir/utt_subset.$i + utils/filter_scp.pl $temp_dir/utt_subset.$i $temp_dir/ali.scp | \ + copy-int-vector scp:- "ark:|gzip -c >$dest/ali.$i.gz" || exit 1; +done + +echo "$0: checking the alignment files generated have at least 90% of the utterances." +for i in `seq 1 $num_jobs`; do + num_lines=` utils/filter_scp.pl $temp_dir/utt_subset.$i $temp_dir/ali.scp | wc -l` || exit 1; + num_lines_tot=`cat $temp_dir/utt_subset.$i |wc -l` || exit 1; + python -c "import sys; +percent = 100.0 * float($num_lines) / $num_lines_tot +if percent < 90 : + print ('$dest/ali.$i.gz {0}% utterances missing.'.format(percent))" || exit 1; +done +rm -r $temp_dir 2>/dev/null + +echo "Combined alignments and stored in $dest" +exit 0 diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh index 2611a53045a..96fe99d42b3 100755 --- a/egs/wsj/s5/utils/combine_data.sh +++ b/egs/wsj/s5/utils/combine_data.sh @@ -39,9 +39,37 @@ for dir in $*; do fi done -for file in utt2spk utt2lang feats.scp text cmvn.scp segments reco2file_and_channel wav.scp spk2gender $extra_files; do +# W.r.t. utt2uniq file the script has different behavior compared to other files +# it is not compulsary for it to exist in src directories, but if it exists in +# even one it should exist in all. We will create the files where necessary +has_utt2uniq=false +for in_dir in $*; do + if [ -f $in_dir/utt2uniq ]; then + has_utt2uniq=true + break + fi +done + +if $has_utt2uniq; then + # we are going to create an utt2uniq file in the destdir + for in_dir in $*; do + if [ ! -f $in_dir/utt2uniq ]; then + # we assume that utt2uniq is a one to one mapping + cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' + else + cat $in_dir/utt2uniq + fi + done | sort -k1 > $dest/utt2uniq + echo "$0: combined utt2uniq" +fi +# some of the old scripts might provide utt2uniq as an extrafile, so just remove it +extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") + +for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp segments reco2file_and_channel wav.scp spk2gender $extra_files; do if [ -f $first_src/$file ]; then + set -o pipefail ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; + set +o pipefail echo "$0: combined $file" else echo "$0 [info]: not combining $file as it does not exist" diff --git a/egs/wsj/s5/utils/convert_slf.pl b/egs/wsj/s5/utils/convert_slf.pl index ee1941011e5..1bc6421f2da 100755 --- a/egs/wsj/s5/utils/convert_slf.pl +++ b/egs/wsj/s5/utils/convert_slf.pl @@ -115,7 +115,7 @@ $ss = scalar split(/_/, $ss); # update the end time - die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}; + die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}{t}; $time_end = $nodes{$s}{t} + $ss; if ($latest_time < $time_end) { $latest_time = $time_end; } diff --git a/egs/wsj/s5/utils/convert_slf_parallel.sh b/egs/wsj/s5/utils/convert_slf_parallel.sh index 1b4b2ef75fc..4e4ce41d236 100755 --- a/egs/wsj/s5/utils/convert_slf_parallel.sh +++ b/egs/wsj/s5/utils/convert_slf_parallel.sh @@ -33,7 +33,7 @@ dir=$3 model=$(dirname $dir)/final.mdl # assume model one level up from decoding dir. 
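+# Note: word alignment is done with lattice-align-words-lexicon below, so the
+# lang directory needs phones/align_lexicon.int; word_boundary.int is no longer
+# required.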
-for f in $lang/words.txt $lang/phones/word_boundary.int $model $dir/lat.1.gz; do +for f in $lang/words.txt $lang/phones/align_lexicon.int $model $dir/lat.1.gz; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done @@ -50,7 +50,8 @@ nj=$(cat $dir/num_jobs) # convert the lattices (individually, gzipped) $cmd $parallel_opts JOB=1:$nj $dir/$dirname/log/lat_convert.JOB.log \ mkdir -p $dir/$dirname/JOB/ '&&' \ - lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \ + lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \ + $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \ utils/int2sym.pl -f 3 $lang/words.txt \| \ utils/convert_slf.pl $word_to_node_arg - $dir/$dirname/JOB/ || exit 1 diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index e7a4b8276b3..5e1a9cba470 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -46,7 +46,7 @@ srcdir=$1 destdir=$2 if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" exit 1; fi @@ -57,6 +57,14 @@ mkdir -p $destdir cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map +if [ ! -f $srcdir/utt2uniq ]; then + if [[ ! -z $utt_prefix || ! -z $utt_suffix ]]; then + cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq + fi +else + cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq +fi + cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk @@ -74,7 +82,7 @@ if [ -f $srcdir/segments ]; then cp $srcdir/reco2file_and_channel $destdir/ fi else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then + if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp fi fi @@ -82,6 +90,9 @@ fi if [ -f $srcdir/text ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi diff --git a/egs/wsj/s5/utils/create_data_link.pl b/egs/wsj/s5/utils/create_data_link.pl index 0fafa2e041b..eeed315e6dd 100755 --- a/egs/wsj/s5/utils/create_data_link.pl +++ b/egs/wsj/s5/utils/create_data_link.pl @@ -43,29 +43,30 @@ sub GetGCD { foo/egs.3.4.ark -> storage/4/egs.3.4.ark -Usage: utils/create_data_link.pl - e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark +Usage: utils/create_data_link.pl [ ... ] + e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark + (note: the dirname, e.g. foo/bar/, must be the same in all cases). See also utils/remove_data_links.sh EOU GetOptions(); -if (@ARGV != 1) { +if (@ARGV == 0) { die $Usage; } -my $fullpath = shift(@ARGV); +my $example_fullpath = $ARGV[0]; # Check if the storage has been created. If so, do nothing. -my $dirname = dirname($fullpath); +my $dirname = dirname($example_fullpath); if (! 
-d "$dirname/storage") { exit(0); } # Storage exists, create symbolic links in the next few steps. -# First, get a list of the available storage direstories, and check if they are +# First, get a list of the available storage directories, and check if they are # properly created. opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); @@ -83,25 +84,48 @@ sub GetGCD { } } -# Finally, work out the directory index where we should put the data to. -my $basename = basename($fullpath); -my $filename_numbers = $basename; -$filename_numbers =~ s/[^0-9]+/ /g; -my @filename_numbers = split(" ", $filename_numbers); -my $total = 0; -my $index = 0; -foreach my $x (@filename_numbers) { - if ($index >= scalar(@coprimes)) { - $index = 0; +my $ret = 0; + +foreach my $fullpath (@ARGV) { + if ($dirname ne dirname($fullpath)) { + die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath"; } - $total += $x * $coprimes[$index]; - $index++; -} -my $dir_index = $total % $num_storage + 1; -# Make the symbolic link. -if (-e $fullpath) { - unlink($fullpath); + # Finally, work out the directory index where we should put the data to. + my $basename = basename($fullpath); + my $filename_numbers = $basename; + $filename_numbers =~ s/[^0-9]+/ /g; + my @filename_numbers = split(" ", $filename_numbers); + my $total = 0; + my $index = 0; + foreach my $x (@filename_numbers) { + if ($index >= scalar(@coprimes)) { + $index = 0; + } + $total += $x * $coprimes[$index]; + $index++; + } + my $dir_index = $total % $num_storage + 1; + + # Make the symbolic link. + if (-e $fullpath) { + unlink($fullpath); + } + if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure + $ret = 1; # will exit with error status. + } } -my $ret = symlink("storage/$dir_index/$basename", $fullpath); -exit($ret == 1 ? 0 : 1); + +exit($ret); + +## testing: +# rm -rf foo bar +# mkdir -p bar/{1,2,3,4} +# mkdir -p foo/storage +# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done +# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark +# ls -l foo +# total 0 +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark +# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark +# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage diff --git a/egs/wsj/s5/utils/create_split_dir.pl b/egs/wsj/s5/utils/create_split_dir.pl index 0c4f023f7f3..0acf53f4c2c 100755 --- a/egs/wsj/s5/utils/create_split_dir.pl +++ b/egs/wsj/s5/utils/create_split_dir.pl @@ -44,20 +44,39 @@ my $dir = pop(@ARGV); system("mkdir -p $dir 2>/dev/null"); -my $index = 1; + +my @all_actual_storage = (); foreach my $file (@ARGV) { - $file = $file . "/" . $suffix; - my $actual_storage = File::Spec->rel2abs($file); + push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix); +} + +my $index = 1; +foreach my $actual_storage (@all_actual_storage) { my $pseudo_storage = "$dir/$index"; # If the symbolic link already exists, delete it. if (-l $pseudo_storage) { print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; + $index++; next; } # Create the destination directory and make the link. system("mkdir -p $actual_storage 2>/dev/null"); + if ($? != 0) { + print STDERR "$0: error creating directory $actual_storage\n"; + exit(1); + } + { # create a README file for easier deletion. 
+ open(R, ">$actual_storage/README.txt"); + my $storage_dir = File::Spec->rel2abs($dir); + print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; + print R "# The full list of directories where this data resides is:\n"; + foreach my $d (@all_actual_storage) { + print R "$d\n"; + } + close(R); + } my $ret = symlink($actual_storage, $pseudo_storage); # Process the returned values diff --git a/egs/wsj/s5/utils/data/combine_data.sh b/egs/wsj/s5/utils/data/combine_data.sh new file mode 120000 index 00000000000..0aed7e823b7 --- /dev/null +++ b/egs/wsj/s5/utils/data/combine_data.sh @@ -0,0 +1 @@ +../combine_data.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/copy_data_dir.sh b/egs/wsj/s5/utils/data/copy_data_dir.sh new file mode 120000 index 00000000000..b9854db4655 --- /dev/null +++ b/egs/wsj/s5/utils/data/copy_data_dir.sh @@ -0,0 +1 @@ +../copy_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/get_frame_shift.sh b/egs/wsj/s5/utils/data/get_frame_shift.sh new file mode 100755 index 00000000000..77f5f8eb7dc --- /dev/null +++ b/egs/wsj/s5/utils/data/get_frame_shift.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script takes as input a data directory, such as data/train/, preferably +# with utt2dur file already existing (or the utt2dur file will be created if +# not), and it attempts to work out the approximate frame shift by comparing the +# utt2dur with the output of feat-to-len on the feats.scp. It prints it out. +# if the shift is very close to, but above, 0.01 (the normal frame shift) it +# rounds it down. + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + echo "This script prints the frame-shift (e.g. 0.01) to the standard out." + echo "If does not contain utt2dur, this script will call utils/data/get_utt2dur.sh," + echo "which will require write permission to " + exit 1 +fi + +export LC_ALL=C + +dir=$1 + +if [ ! -f $dir/utt2dur ]; then + echo "$0: $dir/utt2dur does not exist: creating it" 1>&2 + utils/data/get_utt2dur.sh $dir 1>&2 +fi + +if [ ! -f $dir/feats.scp ]; then + echo "$0: $dir/feats.scp does not exist" 1>&2 + exit 1 +fi + +temp=$(mktemp /tmp/tmp.XXXX) + +feat-to-len scp:$dir/feats.scp ark,t:- | head -n 10 > $temp + +if [ -z $temp ]; then + echo "$0: error running feat-to-len" 1>&2 + exit 1 +fi + +head -n 10 $dir/utt2dur | paste - $temp | \ + awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }' || exit 1; + +rm $temp + +exit 0 diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh new file mode 100755 index 00000000000..9c4aae5e693 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +# This script works out the approximate number of frames in a training directory. +# This is sometimes needed by higher-level scripts + + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + ( + echo "Usage: $0 " + echo "Prints the number of frames of data in the data-dir" + ) 1>&2 +fi + +data=$1 + +if [ ! 
-f $data/utt2dur ]; then + utils/data/get_utt2dur.sh $data 1>&2 || exit 1 +fi + +frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 + +awk -v s=$frame_shift '{n += $2} END{print int(n / s)}' <$data/utt2dur diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh new file mode 100755 index 00000000000..20e89e44ed9 --- /dev/null +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and adds the +# utt2dur file if it does not already exist. The file 'utt2dur' maps from +# utterance to the duration of the utterance in seconds. This script works it +# out from the 'segments' file, or, if not present, from the wav.scp file (it +# first tries interrogating the headers, and if this fails, it reads the wave +# files in entirely.) + +frame_shift=0.01 + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 [options] " + echo "e.g.:" + echo " $0 data/train" + echo " Options:" + echo " --frame-shift # frame shift in seconds. Only relevant when we are" + echo " # getting duration from feats.scp (default: 0.01). " + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ -f $data/utt2dur ]; then + echo "$0: $data/utt2dur file already exists. The script is not going to be executed." + exit 0; +fi + +if [ -f $data/segments ]; then + echo "$0: working out $data/utt2dur from $data/segments" + cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur +elif [ -f $data/wav.scp ]; then + echo "$0: segments file does not exist so getting durations from wave files" + + # if the wav.scp contains only lines of the form + # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | + if cat $data/wav.scp | perl -e ' + while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. + @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && + $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } + $utt = $A[0]; $sphere_file = $A[4]; + if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; } + $sample_rate = -1; $sample_count = -1; + for ($n = 0; $n <= 30; $n++) { + $line = ; + if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; } + if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; } + if ($line =~ m/end_head/) { break; } + } + close(F); + if ($sample_rate == -1 || $sample_count == -1) { + die "could not parse sphere header from $sphere_file"; + } + $duration = $sample_count * 1.0 / $sample_rate; + print "$utt $duration\n"; + } ' > $data/utt2dur; then + echo "$0: successfully obtained utterance lengths from sphere-file headers" + else + echo "$0: could not get utterance lengths from sphere-file headers, using wav-to-duration" + if ! command -v wav-to-duration >/dev/null; then + echo "$0: wav-to-duration is not on your path" + exit 1; + fi + + read_entire_file=false + if cat $data/wav.scp | grep -q 'sox.*speed'; then + read_entire_file=true + echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." + fi + + if ! 
wav-to-duration --read-entire-file=$read_entire_file scp:$data/wav.scp ark,t:$data/utt2dur 2>&1 | grep -v 'nonzero return status'; then + echo "$0: there was a problem getting the durations; moving $data/utt2dur to $data/.backup/" + mkdir -p $data/.backup/ + mv $data/utt2dur $data/.backup/ + fi + fi +elif [ -f $data/feats.scp ]; then + echo "$0: wave file does not exist so getting durations from feats files" + feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur +else + echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist" + exit 1 +fi + +len1=$(cat $data/utt2spk | wc -l) +len2=$(cat $data/utt2dur | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1" +fi + +echo "$0: computed $data/utt2dur" + +exit 0 diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh new file mode 120000 index 00000000000..1cd5db30d92 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed.sh @@ -0,0 +1 @@ +../perturb_data_dir_speed.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh new file mode 100755 index 00000000000..a5a030ffdd8 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +# Apache 2.0 + +# This script does the standard 3-way speed perturbing of +# a data directory (it operates on the wav.scp). + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: perturb_data_dir_speed_3way.sh " + echo "Applies standard 3-way speed perturbation using factors of 0.9, 1.0 and 1.1." + echo "e.g.:" + echo " $0 data/train data/train_sp" + echo "Note: if /feats.scp already exists, this will refuse to run." + exit 1 +fi + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/wav.scp ]; then + echo "$0: expected $srcdir/wav.scp to exist" + exit 1 +fi + +if [ -f $destdir/feats.scp ]; then + echo "$0: $destdir/feats.scp already exists: refusing to run this (please delete $destdir/feats.scp if you want this to run)" + exit 1 +fi + +utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 +utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 +utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 + +echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" +utils/validate_data_dir.sh --no-feats $destdir + diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh new file mode 100755 index 00000000000..7c58b59a846 --- /dev/null +++ b/egs/wsj/s5/utils/data/perturb_data_dir_volume.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and modifies +# the wav.scp to perturb the volume (typically useful for training data when +# using systems that don't have cepstral mean normalization). + +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo "e.g.:" + echo " $0 data/train" + exit 1 +fi + +export LC_ALL=C + +data=$1 + +if [ ! 
-f $data/wav.scp ]; then + echo "$0: Expected $data/wav.scp to exist" + exit 1 +fi + +if grep -q "sox --vol" $data/wav.scp; then + echo "$0: It looks like the data was already volume perturbed. Not doing anything." + exit 0 +fi + +cat $data/wav.scp | python -c " +import sys, os, subprocess, re, random +random.seed(0) +scale_low = 1.0/8 +scale_high = 2.0 +for line in sys.stdin.readlines(): + if len(line.strip()) == 0: + continue + if line.strip()[-1] == '|': + print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) + else: + parts = line.split() + print '{id} sox --vol {vol} -t wav {wav} -t wav - |'.format(id = parts[0], wav=' '.join(parts[1:]), vol = random.uniform(scale_low, scale_high)) +" > $data/wav.scp_scaled || exit 1; + +len1=$(cat $data/wav.scp | wc -l) +len2=$(cat $data/wav.scp_scaled | wc -l) +if [ "$len1" != "$len2" ]; then + echo "$0: error detected: number of lines changed $len1 vs $len2"; + exit 1 +fi + +mv $data/wav.scp_scaled $data/wav.scp + +if [ -f $data/feats.scp ]; then + echo "$0: $data/feats.scp exists; moving it to $data/.backup/ as it wouldn't be valid any more." + mkdir -p $data/.backup/ + mv $data/feats.scp $data/.backup/ +fi + +echo "$0: added volume perturbation to the data in $data" +exit 0 + diff --git a/egs/wsj/s5/utils/data/validate_data_dir.sh b/egs/wsj/s5/utils/data/validate_data_dir.sh new file mode 120000 index 00000000000..1e19b4d921f --- /dev/null +++ b/egs/wsj/s5/utils/data/validate_data_dir.sh @@ -0,0 +1 @@ +../validate_data_dir.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 5493f4b03cb..f9d2890ea39 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -75,7 +75,7 @@ cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ - sort -k1,1 -k3 > $dir/lexiconp.txt + sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt n_old=$(wc -l <$dir/lexicon.txt) @@ -201,7 +201,7 @@ if [ -n "$sil_counts" ]; then if ($F_nl_EOS == "0.00") { $F_nl_EOS = "0.01"; } print SP " $P_BOS_sr\n_s $F_sl_EOS\n_n $F_nl_EOS\noverall $sil_prob\n"; ' $dir/lexiconp.txt $bigram_counts $dir/lexiconp_silprob_unsorted.txt $dir/silprob.txt - sort -k1,1 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt + sort -k1,1 -k2g,2 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are diff --git a/egs/wsj/s5/utils/filter_scps.pl b/egs/wsj/s5/utils/filter_scps.pl index 36c96a7a872..c4c283fb599 100755 --- a/egs/wsj/s5/utils/filter_scps.pl +++ b/egs/wsj/s5/utils/filter_scps.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) -# 2015 Xiaohui Zhang +# Copyright 2010-2012 Microsoft Corporation +# 2012-2016 Johns Hopkins University (author: Daniel Povey) +# 2015 Xiaohui Zhang # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ $idlist = shift @ARGV; -if (defined $jobname && $idlist !~ m/$jobname/ && +if ($idlist !~ m/$jobname/ && $jobend > $jobstart) { print STDERR "filter_scps.pl: you are trying to use multiple filter files as filter patterns but " . 
"you are providing just one filter file ($idlist)\n"; @@ -67,52 +67,96 @@ $infile = shift @ARGV; -open (F, "< $infile") or die "Can't open $infile for read: $!"; -my @inlines; -@inlines = ; -close(F); $outfile = shift @ARGV; -if (defined $jobname && $outfile !~ m/$jobname/ && - $jobend > $jobstart) { +if ($outfile !~ m/$jobname/ && $jobend > $jobstart) { print STDERR "filter_scps.pl: you are trying to create multiple filtered files but " . "you are providing just one output file ($outfile)\n"; exit(1); } +# This hashes from the id (e.g. utterance-id) to an array of the relevant +# job-ids (which are integers). In any normal use-case, this array will contain +# exactly one job-id for any given id, but we want to be agnostic about this. +%id2jobs = ( ); + +# Some variables that we set to produce a warning. +$warn_uncovered = 0; +$warn_multiply_covered = 0; + for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $outfile_n = $outfile; $idlist_n = $idlist; - if (defined $jobname) { - $idlist_n =~ s/$jobname/$jobid/g; - $outfile_n =~ s/$jobname/$jobid/g; - } + $idlist_n =~ s/$jobname/$jobid/g; open(F, "<$idlist_n") || die "Could not open id-list file $idlist_n"; - my %seen; + while() { @A = split; - @A>=1 || die "Invalid line $_ in id-list file $idlist_n"; - $seen{$A[0]} = 1; + @A >= 1 || die "Invalid line $_ in id-list file $idlist_n"; + $id = $A[0]; + if (! defined $id2jobs{$id}) { + $id2jobs{$id} = [ ]; # new anonymous array. + } + push @{$id2jobs{$id}}, $jobid; } close(F); - open(FW, ">$outfile_n") || die "Could not open output file $outfile_n"; - foreach (@inlines) { - if ($field == 1) { # Treat this as special case, since it is common. - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - if ($seen{$1}) { - print FW $_; - } - } else { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ($seen{$A[$field-1]}) { - print FW $_; +} + +# job2output hashes from the job-id, to an anonymous array containing +# a sequence of output lines. +%job2output = ( ); +for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $job2output{$jobid} = [ ]; # new anonymous array. +} + +open (F, "< $infile") or die "Can't open $infile for read: $!"; +while () { + if ($field == 1) { # Treat this as special case, since it is common. + $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; + # $1 is what we filter on. + $id = $1; + } else { + @A = split; + @A > 0 || die "Invalid scp file line $_"; + @A >= $field || die "Invalid scp file line $_"; + $id = $A[$field-1]; + } + if ( ! defined $id2jobs{$id}) { + $warn_uncovered = 1; + } else { + @jobs = @{$id2jobs{$id}}; # this dereferences the array reference. 
+ if (@jobs > 1) { + $warn_multiply_covered = 1; + } + foreach $job_id (@jobs) { + if (!defined $job2output{$job_id}) { + die "Likely code error"; } + push @{$job2output{$job_id}}, $_; } } +} +close(F); + +for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { + $outfile_n = $outfile; + $outfile_n =~ s/$jobname/$jobid/g; + open(FW, ">$outfile_n") || die "Could not open output file $outfile_n"; + $printed = 0; + foreach $line (@{$job2output{$jobid}}) { + print FW $line; + $printed = 1; + } + if (!printed) { + print STDERR "filter_scps.pl: warning: output to $outfile_n is empty\n"; + } close(FW); } + +if ($warn_uncovered) { + print STDERR "filter_scps.pl: warning: some input lines did not get output\n"; +} +if ($warn_multiply_covered) { + print STDERR "filter_scps.pl: warning: some input lines were output to multiple files\n"; +} diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 2ccaa89f507..b6ce1511814 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -1,9 +1,9 @@ #!/bin/bash -# This script makes sure that only the segments present in +# This script makes sure that only the segments present in # all of "feats.scp", "wav.scp" [if present], segments [if present] # text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into +# It puts the original contents of data-dir into # data-dir/.backup if [ $# != 1 ]; then @@ -35,7 +35,8 @@ function check_sorted { fi } -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp reco2file_and_channel spk2gender utt2lang; do +for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x @@ -61,7 +62,7 @@ function filter_file { function filter_recordings { # We call this once before the stage when we filter on utterance-id, and once # after. - + if [ -f $data/segments ]; then # We have a segments file -> we need to filter this and the file wav.scp, and # reco2file_and_utt, if it exists, to make sure they have the same list of @@ -78,7 +79,7 @@ function filter_recordings { utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp mv $tmpdir/recordings.tmp $tmpdir/recordings - + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments filter_file $tmpdir/recordings $data/segments cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments @@ -86,7 +87,7 @@ function filter_recordings { filter_file $tmpdir/recordings $data/wav.scp [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - + fi } @@ -116,8 +117,6 @@ function filter_speakers { function filter_utts { cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts -# Do a check. - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; @@ -128,6 +127,10 @@ function filter_utts { ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; + if [ -f $data/utt2uniq ]; then + ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ + echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; + fi maybe_wav= [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. 
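# Typical usage, for illustration (the directory name is assumed):
#   utils/fix_data_dir.sh data/train
#   utils/validate_data_dir.sh data/train
# Per-utterance files such as utt2uniq and utt2dur are now filtered and backed
# up to data/train/.backup/ along with the other files.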
@@ -151,7 +154,7 @@ function filter_utts { fi fi - for x in utt2spk feats.scp vad.scp text segments utt2lang $maybe_wav; do + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur $maybe_wav; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then @@ -168,8 +171,6 @@ filter_utts filter_speakers filter_recordings - - utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/egs/wsj/s5/utils/format_lm.sh b/egs/wsj/s5/utils/format_lm.sh index 32dbc68031a..4ab1c73217e 100755 --- a/egs/wsj/s5/utils/format_lm.sh +++ b/egs/wsj/s5/utils/format_lm.sh @@ -39,20 +39,9 @@ for f in phones.txt words.txt L.fst L_disambig.fst phones/; do done lm_base=$(basename $lm '.gz') -gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ - > $out_dir/oovs_${lm_base}.txt - -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures -# of CLG [ends up being epsilon cycles]. gunzip -c $lm \ - | egrep -v ' | | ' \ - | arpa2fst - | fstprint \ - | utils/remove_oovs.pl $out_dir/oovs_${lm_base}.txt \ - | utils/eps2disambig.pl | utils/s2eps.pl \ - | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst + | arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst set +e fstisstochastic $out_dir/G.fst set -e @@ -66,7 +55,7 @@ set -e # this might cause determinization failure of CLG. # #0 is treated as an empty word. mkdir -p $out_dir/tmpdir.g -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} +awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} END{print "0 0 #0 #0"; print "0";}' \ < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..c62a356e05f 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -85,30 +85,15 @@ mkdir -p $out_dir cp -r $lang_dir/* $out_dir || exit 1; lm_base=$(basename $lm '.gz') -gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ - > $out_dir/oovs_${lm_base}.txt || exit 1; - -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures -# of CLG [ends up being epsilon cycles]. -gunzip -c $lm \ - | egrep -v ' | | ' \ - | gzip -c > $tmpdir/lm.gz || exit 1; - awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. 
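# A minimal sketch of the new-style compilation (file names are illustrative):
# the old find_arpa_oovs.pl / remove_oovs.pl / eps2disambig.pl pipeline is
# replaced by letting arpa2fst handle the disambiguation symbol and the word
# symbol table directly, e.g.
#   gunzip -c lm.arpa.gz | arpa2fst --disambig-symbol=#0 \
#     --read-symbol-table=words.txt - G.fst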
-change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ - $srilm_opts || exit 1; - -arpa2fst $tmpdir/out_lm | fstprint \ - | utils/eps2disambig.pl | utils/s2eps.pl \ - | fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon | fstarcsort --sort_type=ilabel > $out_dir/G.fst || exit 1; +change-lm-vocab -vocab $tmpdir/voc -lm $lm -write-lm - $srilm_opts | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst || exit 1 fstisstochastic $out_dir/G.fst diff --git a/egs/wsj/s5/utils/gen_topo.pl b/egs/wsj/s5/utils/gen_topo.pl index 2ed33113260..b2e85a43606 100755 --- a/egs/wsj/s5/utils/gen_topo.pl +++ b/egs/wsj/s5/utils/gen_topo.pl @@ -5,7 +5,7 @@ # Generate a topology file. This allows control of the number of states in the # non-silence HMMs, and in the silence HMMs. -if(@ARGV != 4) { +if (@ARGV != 4) { print STDERR "Usage: utils/gen_topo.pl \n"; print STDERR "e.g.: utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3\n"; exit (1); @@ -13,8 +13,10 @@ ($num_nonsil_states, $num_sil_states, $nonsil_phones, $sil_phones) = @ARGV; -( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || die "Unexpected number of nonsilence-model states $num_nonsil_states\n"; -( $num_sil_states >= 3 && $num_sil_states <= 100 ) || die "Unexpected number of silence-model states $num_sil_states\n"; +( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || + die "Unexpected number of nonsilence-model states $num_nonsil_states\n"; +(( $num_sil_states == 1 || $num_sil_states >= 3) && $num_sil_states <= 100 ) || + die "Unexpected number of silence-model states $num_sil_states\n"; $nonsil_phones =~ s/:/ /g; $sil_phones =~ s/:/ /g; @@ -33,31 +35,45 @@ print " $num_nonsil_states \n"; # non-emitting final state. print "\n"; # Now silence phones. They have a different topology-- apart from the first and -# last states, it's fully connected. -$transp = 1.0 / ($num_sil_states-1); +# last states, it's fully connected, as long as you have >= 3 states. -print "\n"; -print "\n"; -print "$sil_phones\n"; -print "\n"; -print " 0 0 "; -for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last - # emitting state. - print " $nextstate $transp "; -} -print "\n"; -for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to - # themselves and to the last emitting state. - print " $state $state "; - for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { +if ($num_sil_states > 1) { + $transp = 1.0 / ($num_sil_states-1); + print "\n"; + print "\n"; + print "$sil_phones\n"; + print "\n"; + print " 0 0 "; + for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last + # emitting state. print " $nextstate $transp "; } print "\n"; + for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to + # themselves and to the last emitting state. + print " $state $state "; + for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { + print " $nextstate $transp "; + } + print "\n"; + } + # Final emitting state (non-skippable). 
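# (For illustration, with the default of 5 silence states: the final emitting
# state is state 4, which gets a 0.75 self-loop and a 0.25 transition to the
# non-emitting state 5, as printed below.)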
+ $state = $num_sil_states-1; + print " $state $state $state 0.75 $num_sil_states 0.25 \n"; + # Final nonemitting state: + print " $num_sil_states \n"; + print "\n"; +} else { + print "\n"; + print "\n"; + print "$sil_phones\n"; + print "\n"; + print " 0 0 "; + print " 0 0.75 "; + print " 1 0.25 "; + print "\n"; + print " $num_nonsil_states \n"; # non-emitting final state. + print "\n"; } -# Final emitting state (non-skippable). -$state = $num_sil_states-1; -print " $state $state $state 0.75 $num_sil_states 0.25 \n"; -# Final nonemitting state: -print " $num_sil_states \n"; -print "\n"; + print "\n"; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..ee0f6ddb515 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! -e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print I $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print I $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(I); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+y/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. 
We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/make_lexicon_fst.pl b/egs/wsj/s5/utils/make_lexicon_fst.pl index 0558ab20bca..bcf0f4df13a 100755 --- a/egs/wsj/s5/utils/make_lexicon_fst.pl +++ b/egs/wsj/s5/utils/make_lexicon_fst.pl @@ -29,7 +29,7 @@ if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt -Creates a lexicon FST that transduces phones to words, and may allow optional silence. +Creates a lexicon FST that transduces phones to words, and may allow optional silence. Note: ordinarily, each line of lexicon.txt is: word phone1 phone2 ... phoneN; if the --pron-probs option is used, each line is: word pronunciation-probability phone1 phone2 ... phoneN. The probability 'prob' will typically be between zero and one, and note that it's generally helpful to normalize so the largest one @@ -42,7 +42,7 @@ $lexfn = shift @ARGV; if (@ARGV == 0) { $silprob = 0.0; -} elsif (@ARGV == 2) { +} elsif (@ARGV == 2) { ($silprob,$silphone) = @ARGV; } else { ($silprob,$silphone,$sildisambig) = @ARGV; @@ -57,19 +57,6 @@ open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; -sub is_sil { - # Return true (1) if provided with a phone-sequence - # that means silence. - # @_ is the parameters of the function - # This function returns true if @_ equals ( $silphone ) - # or something of the form ( "#0", $silphone, "#1" ) - # where the "#0" and "#1" are disambiguation symbols. - return ( @_ == 1 && $_[0] eq $silphone || - (@_ == 3 && $_[1] eq $silphone && - $_[0] =~ m/^\#\d+$/ && - $_[0] =~ m/^\#\d+$/)); -} - if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. $loopstate = 0; $nextstate = 1; # next unallocated state. @@ -92,7 +79,7 @@ sub is_sil { $pron_cost = -log($pron_prob); } if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - + $s = $loopstate; $word_or_eps = $w; while (@A > 0) { @@ -148,18 +135,16 @@ sub is_sil { $word_or_eps = ""; $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. $s = $ns; + } elsif (!defined($silphone) || $p ne $silphone) { + # This is non-deterministic but relatively compact, + # and avoids epsilons. + $local_nosilcost = $nosilcost + $pron_cost; + $local_silcost = $silcost + $pron_cost; + print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; + print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; } else { - if (!is_sil($p)) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. 
- print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } + # no point putting opt-sil after silence word. + print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; } } } diff --git a/egs/wsj/s5/utils/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/make_phone_bigram_lang.sh index 87d1db8f3e8..548cb223632 100755 --- a/egs/wsj/s5/utils/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/make_phone_bigram_lang.sh @@ -11,7 +11,7 @@ # language-id. -# We might later have options here; if not, I'llr emove this. +# We might later have options here; if not, I'll emove this. echo "$0 $@" # Print the command line for logging @@ -42,11 +42,16 @@ rm -r $lang_out/phones 2>/dev/null cp -r $lang/phones/ $lang_out/ rm $lang_out/phones/word_boundary.* 2>/dev/null # these would # no longer be valid. +rm $lang_out/phones/wdisambig* 2>/dev/null # ditto this. + # List of disambig symbols will be empty: not needed, since G.fst and L.fst * G.fst # are determinizable without any. echo -n > $lang_out/phones/disambig.txt echo -n > $lang_out/phones/disambig.int echo -n > $lang_out/phones/disambig.csl +echo -n > $lang_out/phones/wdisambig.txt +echo -n > $lang_out/phones/wdisambig_phones.int +echo -n > $lang_out/phones/wdisambig_words.int # Let OOV symbol be the first phone. This is arbitrary, it's just # so that validate_lang.pl succeeds. We should never actually use @@ -81,7 +86,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ foreach $p (@phones) { $src = $phn2state{$p}; $hist = $histcount{$p}; - $hist > 0 || die; + $hist > 0 || die; foreach $q (@phones) { $c = $count{$p,$q}; if (defined $c) { @@ -92,7 +97,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ } $c = $count{$p,""}; if (defined $c) { - $cost = -log($c / $hist); # cost on FST arc. + $cost = -log($c / $hist); # cost on FST arc. print "$src $cost\n"; # final-prob. } } ' | fstcompile --acceptor=true | \ @@ -101,7 +106,7 @@ ali-to-phones $alidir/final.mdl "ark:gunzip -c $alidir/ali.*.gz|" ark,t:- | \ # symbols for phones and words are the same. # Neither has disambig symbols. cp $lang_out/phones.txt $lang_out/words.txt - + grep -v '' $lang_out/phones.txt | awk '{printf("0 0 %s %s\n", $2, $2);} END{print("0 0.0");}' | \ fstcompile > $lang_out/L.fst @@ -115,5 +120,4 @@ utils/sym2int.pl $lang_out/phones.txt <$lang_out/phones/align_lexicon.txt >$lang # L and L_disambig are the same. cp $lang_out/L.fst $lang_out/L_disambig.fst -utils/validate_lang.pl $lang_out || exit 1; -echo "$0: ignore warnings RE disambiguation symbols from validate_lang.pl (these are expected)" +utils/validate_lang.pl --skip-disambig-check $lang_out || exit 1; diff --git a/egs/wsj/s5/utils/mkgraph.sh b/egs/wsj/s5/utils/mkgraph.sh index f68c6f4099c..b7023538e9b 100755 --- a/egs/wsj/s5/utils/mkgraph.sh +++ b/egs/wsj/s5/utils/mkgraph.sh @@ -7,24 +7,26 @@ # all the language-model, pronunciation dictionary (lexicon), context-dependency, # and HMM structure in our model. The output is a Finite State Transducer # that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). +# that resolve to Gaussian Mixture Models). 
# See # http://kaldi.sourceforge.net/graph_recipe_test.html # (this is compiled from this repository using Doxygen, # the source for this part is in src/doc/graph_recipe_test.dox) +set -o pipefail -N=3 -P=1 tscale=1.0 loopscale=0.1 reverse=false +remove_oov=false -for x in `seq 5`; do - [ "$1" == "--mono" ] && N=1 && P=0 && shift; - [ "$1" == "--quinphone" ] && N=5 && P=2 && shift; +for x in `seq 6`; do + [ "$1" == "--mono" ] && context=mono && shift; + [ "$1" == "--left-biphone" ] && context=lbiphone && shift; + [ "$1" == "--quinphone" ] && context=quinphone && shift; [ "$1" == "--reverse" ] && reverse=true && shift; + [ "$1" == "--remove-oov" ] && remove_oov=true && shift; [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; done @@ -56,13 +58,23 @@ for f in $required; do [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; done +N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } +P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } +if [[ $context == mono && ($N != 1 || $P != 0) || \ + $context == lbiphone && ($N != 2 || $P != 1) || \ + $context == quinphone && ($N != 5 || $P != 2) ]]; then + echo "mkgraph.sh: mismatch between the specified context (--$context) and the one in the tree: N=$N, P=$P" + exit 1 +fi + mkdir -p $lang/tmp -# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in +# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in # place of -o if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst || exit 1; + fstminimizeencoded | fstpushspecial | \ + fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst || exit 1; fstisstochastic $lang/tmp/LG.fst || echo "[info]: LG not stochastic." fi @@ -94,7 +106,12 @@ fi if [[ ! -s $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ $dir/HCLGa.fst -ot $clg ]]; then - fsttablecompose $dir/Ha.fst $clg | fstdeterminizestar --use-log=true \ + if $remove_oov; then + [ ! -f $lang/oov.int ] && \ + echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1; + clg="fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $clg|" + fi + fsttablecompose $dir/Ha.fst "$clg" | fstdeterminizestar --use-log=true \ | fstrmsymbols $dir/disambig_tid.int | fstrmepslocal | \ fstminimizeencoded > $dir/HCLGa.fst || exit 1; fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" @@ -105,7 +122,7 @@ if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then $model < $dir/HCLGa.fst > $dir/HCLG.fst || exit 1; if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. + # No point doing this test if transition-scale not 1, as it is bound to fail. fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." fi fi diff --git a/egs/wsj/s5/utils/pbs.pl b/egs/wsj/s5/utils/pbs.pl new file mode 100755 index 00000000000..6c8d4488882 --- /dev/null +++ b/egs/wsj/s5/utils/pbs.pl @@ -0,0 +1,587 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). 
+# 2014 Johns Hopkins University (Author: Vimal Manohar) +# 2015 Queensland University of Technology (Author: Ahilan Kanagasundaram ) +# Apache 2.0. + +use File::Basename; +use Cwd; +use Getopt::Long; + +# This is a version of the queue.pl modified so that it works under PBS +# The PBS is one of the several "almost compatible" queueing systems. The +# command switches and environment variables are different, so we are adding +# a this script. An optimal solution might probably be to make the variable +# names and the commands configurable, as similar problems can be expected +# with Torque, Univa... and who knows what else +# +# queue.pl has the same functionality as run.pl, except that +# it runs the job in question on the queue (Sun GridEngine). +# This version of queue.pl uses the task array functionality +# of the grid engine. Note: it's different from the queue.pl +# in the s4 and earlier scripts. + +# The script now supports configuring the queue system using a config file +# (default in conf/pbs.conf; but can be passed specified with --config option) +# and a set of command line options. +# The current script handles: +# 1) Normal configuration arguments +# For e.g. a command line option of "--gpu 1" could be converted into the option +# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a +# line in the config file like +# gpu=* -q g.q -l gpu=$0 +# $0 here in the line is replaced with the argument read from the CLI and the +# resulting string is passed to qsub. +# 2) Special arguments to options such as +# gpu=0 +# If --gpu 0 is given in the command line, then no special "-q" is given. +# 3) Default argument +# default gpu=0 +# If --gpu option is not passed in the command line, then the script behaves as +# if --gpu 0 was passed since 0 is specified as the default argument for that +# option +# 4) Arbitrary options and arguments. +# Any command line option starting with '--' and its argument would be handled +# as long as its defined in the config file. +# 5) Default behavior +# If the config file that is passed using is not readable, then the script +# behaves as if the queue has the following config file: +# $ cat conf/pbs.conf +# # Default configuration +# command qsub -v PATH -S /bin/bash -l arch=*64* +# option mem=* -l mem_free=$0,ram_free=$0 +# option mem=0 # Do not add anything to qsub_opts +# option num_threads=* -pe smp $0 +# option num_threads=1 # Do not add anything to qsub_opts +# option max_jobs_run=* -tc $0 +# default gpu=0 +# option gpu=0 -q all.q +# option gpu=* -l gpu=$0 -q g.q + +my $qsub_opts = ""; +my $sync = 0; +my $num_threads = 1; +my $gpu = 0; + +my $config = "conf/pbs.conf"; + +my %cli_options = (); + +my $jobname; +my $jobstart; +my $jobend; + +my $array_job = 0; + +sub print_usage() { + print STDERR + "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n" . + "e.g.: queue.pl foo.log echo baz\n" . + " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . + "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . + " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . + "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . + " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . + " another string other than JOB)\n" . + "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . + "and change its behavior. Otherwise it uses qstat to work out when the job finished\n" . + "Options:\n" . 
+ " --config (default: $config)\n" . + " --mem (e.g. --mem 2G, --mem 500M, \n" . + " also support K and numbers mean bytes)\n" . + " --num-threads (default: $num_threads)\n" . + " --max-jobs-run \n" . + " --gpu <0|1> (default: $gpu)\n"; + exit 1; +} + +if (@ARGV < 2) { + print_usage(); +} + +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to + # allow the JOB=1:n option to be interleaved with the + # options to qsub. + while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { + my $switch = shift @ARGV; + + if ($switch eq "-V") { + $qsub_opts .= "-V "; + } else { + my $argument = shift @ARGV; + if ($argument =~ m/^--/) { + print STDERR "queue.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n"; + } + if ($switch eq "-sync" && $argument =~ m/^[yY]/) { + $sync = 1; + $qsub_opts .= "$switch $argument "; + } elsif ($switch eq "-pe") { # e.g. -pe smp 5 + my $argument2 = shift @ARGV; + $qsub_opts .= "$switch $argument $argument2 "; + $num_threads = $argument2; + } elsif ($switch =~ m/^--/) { # Config options + # Convert CLI option to variable name + # by removing '--' from the switch and replacing any + # '-' with a '_' + $switch =~ s/^--//; + $switch =~ s/-/_/g; + $cli_options{$switch} = $argument; + } else { # Other qsub options - passed as is + $qsub_opts .= "$switch $argument "; + } + } + } + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 + $array_job = 1; + $jobname = $1; + $jobstart = $2; + $jobend = $3; + shift; + if ($jobstart > $jobend) { + die "queue.pl: invalid job range $ARGV[0]"; + } + if ($jobstart <= 0) { + die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation)."; + } + } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. + $array_job = 1; + $jobname = $1; + $jobstart = $2; + $jobend = $2; + shift; + } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { + print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + } +} + +if (@ARGV < 2) { + print_usage(); +} + +if (exists $cli_options{"config"}) { + $config = $cli_options{"config"}; +} + +my $default_config_file = <<'EOF'; +# Default configuration +command qsub -V -v PATH -S /bin/bash -l mem=4G +option mem=* -l mem=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -l ncpus=$0 +option num_threads=1 # Do not add anything to qsub_opts +default gpu=0 +option gpu=0 +option gpu=* -l ncpus=$0 +EOF + +# Here the configuration options specified by the user on the command line +# (e.g. --mem 2G) are converted to options to the qsub system as defined in +# the config file. (e.g. if the config file has the line +# "option mem=* -l ram_free=$0,mem_free=$0" +# and the user has specified '--mem 2G' on the command line, the options +# passed to queue system would be "-l ram_free=2G,mem_free=2G +# A more detailed description of the ways the options would be handled is at +# the top of this file. 
+ +my $opened_config_file = 1; + +open CONFIG, "<$config" or $opened_config_file = 0; + +my %cli_config_options = (); +my %cli_default_options = (); + +if ($opened_config_file == 0 && exists($cli_options{"config"})) { + print STDERR "Could not open config file $config\n"; + exit(1); +} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { + # Open the default config file instead + open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n"; + $config = "Default config"; +} + +my $qsub_cmd = ""; +my $read_command = 0; + +while() { + chomp; + my $line = $_; + $_ =~ s/\s*#.*//g; + if ($_ eq "") { next; } + if ($_ =~ /^command (.+)/) { + $read_command = 1; + $qsub_cmd = $1 . " "; + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + # Config option that needs replacement with parameter value read from CLI + # e.g.: option mem=* -l mem_free=$0,ram_free=$0 + my $option = $1; # mem + my $arg= $2; # -l mem_free=$0,ram_free=$0 + if ($arg !~ m:\$0:) { + die "Unable to parse line '$line' in config file ($config)\n"; + } + if (exists $cli_options{$option}) { + # Replace $0 with the argument read from command line. + # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G" + $arg =~ s/\$0/$cli_options{$option}/g; + $cli_config_options{$option} = $arg; + } + } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) { + # Config option that does not need replacement + # e.g. option gpu=0 -q all.q + my $option = $1; # gpu + my $value = $2; # 0 + my $arg = $3; # -q all.q + if (exists $cli_options{$option}) { + $cli_default_options{($option,$value)} = $arg; + } + } elsif ($_ =~ m/^default (\S+)=(\S+)/) { + # Default options. Used for setting default values to options i.e. when + # the user does not specify the option on the command line + # e.g. default gpu=0 + my $option = $1; # gpu + my $value = $2; # 0 + if (!exists $cli_options{$option}) { + # If the user has specified this option on the command line, then we + # don't have to do anything + $cli_options{$option} = $value; + } + } else { + print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n"; + exit(1); + } +} + +close(CONFIG); + +if ($read_command != 1) { + print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n"; + exit(1); +} + +for my $option (keys %cli_options) { + if ($option eq "config") { next; } + if ($option eq "max_jobs_run" && $array_job != 1) { next; } + my $value = $cli_options{$option}; + + if (exists $cli_default_options{($option,$value)}) { + $qsub_opts .= "$cli_default_options{($option,$value)} "; + } elsif (exists $cli_config_options{$option}) { + $qsub_opts .= "$cli_config_options{$option} "; + } else { + if ($opened_config_file == 0) { $config = "default config file"; } + die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n"; + } +} + +my $cwd = getcwd(); +my $logfile = shift @ARGV; + +if ($array_job == 1 && $logfile !~ m/$jobname/ + && $jobend > $jobstart) { + print STDERR "queue.pl: you are trying to run a parallel job but " + . "you are putting the output into just one log file ($logfile)\n"; + exit(1); +} + +# +# Work out the command; quote escaping is done here. +# Note: the rules for escaping stuff are worked out pretty +# arbitrarily, based on what we want it to do. Some things that +# we pass as arguments to queue.pl, such as "|", we want to be +# interpreted by bash, so we don't escape them. 
Other things, +# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want +# to be passed, in quotes, to the Kaldi program. Our heuristic +# is that stuff with spaces in should be quoted. This doesn't +# always work. +# +my $cmd = ""; + +foreach my $x (@ARGV) { + if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take + # as-is. + elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single + else { $cmd .= "\"$x\" "; } # else use double. +} + +# +# Work out the location of the script file, and open it for writing. +# +my $dir = dirname($logfile); +my $base = basename($logfile); +my $qdir = "$dir/q"; +$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. +my $queue_logfile = "$qdir/$base"; + +if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this... +if (!-d $dir) { die "Cannot make the directory $dir\n"; } +# make a directory called "q", +# where we will put the log created by qsub... normally this doesn't contain +# anything interesting, evertyhing goes to $logfile. +if (! -d "$qdir") { + system "mkdir $qdir 2>/dev/null"; + sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, + ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been + ## created and the job immediately ran, it would die with an error because nfs + ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our + ## NFS settings to something like 5 seconds. +} + +my $queue_array_opt = ""; +if ($array_job == 1) { # It's an array job. + $queue_array_opt = "-J $jobstart-$jobend"; + $logfile =~ s/$jobname/\$PBS_ARRAY_INDEX/g; # This variable will get + # replaced by qsub, in each job, with the job-id. + $cmd =~ s/$jobname/\$\{PBS_ARRAY_INDEX\}/g; # same for the command... + $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory + # is for the queue to put its log, and this doesn't need the task array subscript + # so we remove it. +} + +# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but +# with the suffix .sh. +my $queue_scriptfile = $queue_logfile; +($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh"); +if ($queue_scriptfile !~ m:^/:) { + $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case. +} + +# We'll write to the standard input of "qsub" (the file-handle Q), +# the job that we want it to execute. +# Also keep our current PATH around, just in case there was something +# in it that we need (although we also source ./path.sh) + +my $syncfile = "$qdir/done.$$"; + +system("rm $queue_logfile $syncfile 2>/dev/null"); +# +# Write to the script file, and then close it. +# +open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile"; + +print Q "#!/bin/bash\n"; +print Q "cd $cwd\n"; +print Q ". ./path.sh\n"; +print Q "( echo '#' Running on \`hostname\`\n"; +print Q " echo '#' Started at \`date\`\n"; +print Q " echo -n '# '; cat <$logfile\n"; +print Q "time1=\`date +\"%s\"\`\n"; +print Q " ( $cmd ) 2>>$logfile >>$logfile\n"; +print Q "ret=\$?\n"; +print Q "time2=\`date +\"%s\"\`\n"; +print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; +print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; +print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; + # let the script return with status 100 which will put it to E state; more easily rerunnable. 
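# (Status 137 corresponds to 128 + SIGKILL(9), the exit code a process gets
# when it is killed, e.g. by the kernel OOM killer or a hard queue limit.)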
+if ($array_job == 0) { # not an array job + print Q "touch $syncfile\n"; # so we know it's done. +} else { + print Q "touch $syncfile.\$PBS_ARRAY_INDEX\n"; # touch a bunch of sync-files. +} +print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine +print Q "## submitted with:\n"; # treats specially. +$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1"; +print Q "# $qsub_cmd\n"; +if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; + die "Failed to close the script file (full disk?)"; +} + +my $ret = system ($qsub_cmd); +if ($ret != 0) { + if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status) + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } + print STDERR "queue.pl: job writing to $logfile failed\n"; + } else { + print STDERR "queue.pl: error submitting jobs to queue (return status was $ret)\n"; + print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n"; + print STDERR `tail $queue_logfile`; + } + exit(1); +} + +my $sge_job_id; +if (! $sync) { # We're not submitting with -sync y, so we + # need to wait for the jobs to finish. We wait for the + # sync-files we "touched" in the script to exist. + my @syncfiles = (); + if (!defined $jobname) { # not an array job. + push @syncfiles, $syncfile; + } else { + for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { + push @syncfiles, "$syncfile.$jobid"; + } + } + # We will need the sge_job_id, to check that job still exists + { # Get the SGE job-id from the log file in q/ + open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile"; + undef $sge_job_id; + while () { + if (m/Your job\S* (\d+)[. ].+ has been submitted/) { + if (defined $sge_job_id) { + die "Error: your job was submitted more than once (see $queue_logfile)"; + } else { + $sge_job_id = $1; + } + } + } + close(L); + if (!defined $sge_job_id) { + die "Error: log file $queue_logfile does not specify the SGE job-id."; + } + } + my $check_sge_job_ctr=1; + # + my $wait = 0.1; + my $counter = 0; + foreach my $f (@syncfiles) { + # wait for them to finish one by one. + while (! -f $f) { + sleep($wait); + $wait *= 1.2; + if ($wait > 3.0) { + $wait = 3.0; # never wait more than 3 seconds. + # the following (.kick) commands are basically workarounds for NFS bugs. + if (rand() < 0.25) { # don't do this every time... + if (rand() > 0.5) { + system("touch $qdir/.kick"); + } else { + system("rm $qdir/.kick 2>/dev/null"); + } + } + if ($counter++ % 10 == 0) { + # This seems to kick NFS in the teeth to cause it to refresh the + # directory. I've seen cases where it would indefinitely fail to get + # updated, even though the file exists on the server. + # Only do this every 10 waits (every 30 seconds) though, or if there + # are many jobs waiting they can overwhelm the file server. + system("ls $qdir >/dev/null"); + } + } + + # Check that the job exists in SGE. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. + if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. + if ( -f $f ) { next; }; #syncfile appeared: OK. + $ret = system("qstat -t $sge_job_id >/dev/null 2>/dev/null"); + # system(...) : To get the actual exit value, shift $ret right by eight bits. 
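# (An exit value of 1 from qstat is taken below to mean the job is no longer
# known to the scheduler.)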
+ if ($ret>>8 == 1) { # Job does not seem to exist + # Don't consider immediately missing job as error, first wait some + # time to make sure it is not just delayed creation of the syncfile. + + sleep(3); + # Sometimes NFS gets confused and thinks it's transmitted the directory + # but it hasn't, due to timestamp issues. Changing something in the + # directory will usually fix that. + system("touch $qdir/.kick"); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + sleep(7); + system("touch $qdir/.kick"); + sleep(1); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + sleep(60); + system("touch $qdir/.kick"); + sleep(1); + system("rm $qdir/.kick 2>/dev/null"); + if ( -f $f ) { next; } #syncfile appeared, ok + $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f"; + my $job_id = $1; + if (defined $jobname) { + $logfile =~ s/\$PBS_ARRAY_INDEX/$job_id/g; + } + my $last_line = `tail -n 1 $logfile`; + if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) { + # if the last line of $logfile ended with "status 0" and + # $logfile is newer than this program [(-M $logfile) gives the + # time elapsed between file modification and the start of this + # program], then we assume the program really finished OK, + # and maybe something is up with the file system. + print STDERR "**queue.pl: syncfile $f was not created but job seems\n" . + "**to have finished OK. Probably your file-system has problems.\n" . + "**This is just a warning.\n"; + last; + } else { + chop $last_line; + print STDERR "queue.pl: Error, unfinished job no " . + "longer exists, log is in $logfile, last line is '$last_line', " . + "syncfile is $f, return status of qstat was $ret\n" . + "Possible reasons: a) Exceeded time limit? -> Use more jobs!" . + " b) Shutdown/Frozen machine? -> Run again!\n"; + exit(1); + } + } elsif ($ret != 0) { + print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -t $sge_job_id,$!)\n"; + } + } + } + } + my $all_syncfiles = join(" ", @syncfiles); + system("rm $all_syncfiles 2>/dev/null"); +} + +# OK, at this point we are synced; we know the job is done. +# But we don't know about its exit status. We'll look at $logfile for this. +# First work out an array @logfiles of file-locations we need to +# read (just one, unless it's an array job). +my @logfiles = (); +if (!defined $jobname) { # not an array job. + push @logfiles, $logfile; +} else { + for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { + my $l = $logfile; + $l =~ s/\$PBS_ARRAY_INDEX/$jobid/g; + push @logfiles, $l; + } +} + +my $num_failed = 0; +my $status = 1; +foreach my $l (@logfiles) { + my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); + for (my $iter = 0; $iter <= @wait_times; $iter++) { + my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last + # line of the file, I've seen cases where it was not quite the last line because + # of delayed output by the process that was running, or processes it had called. + # so tail -10 gives it a little leeway. + if ($line =~ m/with status (\d+)/) { + $status = $1; + last; + } else { + if ($iter < @wait_times) { + sleep($wait_times[$iter]); + } else { + if (! -f $l) { + print STDERR "Log-file $l does not exist.\n"; + } else { + print STDERR "The last line of log-file $l does not seem to indicate the " + . "return status as expected\n"; + } + exit(1); # Something went wrong with the queue, or the + # machine it was running on, probably. 
+ } + } + } + # OK, now we have $status, which is the return-status of + # the command in the job. + if ($status != 0) { $num_failed++; } +} +if ($num_failed == 0) { exit(0); } +else { # we failed. + if (@logfiles == 1) { + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/$jobstart/g; } + print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; + if ($logfile =~ m/JOB/) { + print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; + } + } else { + if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } + my $numjobs = 1 + $jobend - $jobstart; + print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; + } + exit(1); +} diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index 61c0962cf15..5de8b994705 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Copyright 2013 Johns Hopkins University (author: Daniel Povey) # 2014 Tom Ko @@ -36,7 +36,7 @@ which sox &>/dev/null ! [ $? -eq 0 ] && echo "sox: command not found" && exit 1; if [ ! -f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" + echo "$0: no such file $srcdir/utt2spk" exit 1; fi @@ -65,18 +65,18 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp if [ -f $srcdir/reco2file_and_channel ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel fi - + rm $destdir/reco_map 2>/dev/null else # no segments->wav indexed by utterance. if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} + '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp fi fi @@ -88,6 +88,13 @@ if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +if [ ! 
-f $srcdir/utt2dur ]; then + # generate utt2dur if it does not exist in srcdir + utils/data/get_utt2dur.sh $srcdir +fi + +cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ + awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur rm $destdir/spk_map $destdir/utt_map 2>/dev/null echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 7701a956235..0014f22a04e 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -114,8 +116,8 @@ fi # phones.txt file provided, we will do some sanity check here. if [[ ! 
-z $phone_symbol_table ]]; then # Checks if we have position dependent phones - n1=`cat $phone_symbol_table | grep -v -P "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` - n2=`cat $phone_symbol_table | grep -v -P "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` + n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` + n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` $position_dependent_phones && [ $n1 -eq $n2 ] &&\ echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1; ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\ @@ -123,7 +125,7 @@ if [[ ! -z $phone_symbol_table ]]; then # Checks if the phone sets match. cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { sub(/((_[BEIS])|) [0-9]+$/, "", $0); phones[$0] = 1; }} + BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} { for (x = 1; x <= NF; ++x) { if (!($x in phones)) { print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1; fi @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -174,11 +176,11 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. - cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -245,10 +247,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. 
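# For illustration (hypothetical phones AA and IY): the _B iteration below
# appends one line, "AA_B IY_B", listing every non-silence phone with the _B
# suffix; likewise for _E, _I and _S, and the silence loop also includes the
# suffix-less forms.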
if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi @@ -277,7 +279,7 @@ if [[ ! -z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). +# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. 
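# (For illustration: wdisambig_phones.int and wdisambig_words.int each hold the
# integer id of '#0' under phones.txt and words.txt respectively, and
# fstaddselfloops now reads those files instead of the old inline
# "echo <symbol-id> |" pipes.)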
-phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/queue.pl b/egs/wsj/s5/utils/queue.pl index 1e36de63053..cba8ff1a191 100755 --- a/egs/wsj/s5/utils/queue.pl +++ b/egs/wsj/s5/utils/queue.pl @@ -18,7 +18,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -28,7 +28,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -94,12 +94,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -116,10 +116,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -145,7 +145,7 @@ () $jobend = $2; shift; } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; + print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; } } @@ -155,7 +155,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -172,7 +172,7 @@ () # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. 
if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -186,7 +186,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -206,12 +206,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { die "Unable to parse line '$line' in config file ($config)\n"; } if (exists $cli_options{$option}) { @@ -231,7 +231,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -291,7 +291,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -312,19 +312,19 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. - $queue_array_opt = "-t $jobstart:$jobend"; - $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get + $queue_array_opt = "-t $jobstart:$jobend"; + $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SGE_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -455,14 +455,14 @@ () } } - # Check that the job exists in SGE. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SGE. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("qstat -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. 
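As a minimal sketch of that idiom (the job id here is made up): Perl's system() returns the raw wait status, so the child's exit code is recovered with a right shift by eight bits:

    my $ret = system("qstat -j 12345 >/dev/null 2>/dev/null");
    my $exit_code = $ret >> 8;   # an exit code of 1 is taken to mean the job is no longer known to SGE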
if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -526,7 +526,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SGE_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/reverse_lm.sh b/egs/wsj/s5/utils/reverse_lm.sh index cc6b66dca03..228fff20fbe 100755 --- a/egs/wsj/s5/utils/reverse_lm.sh +++ b/egs/wsj/s5/utils/reverse_lm.sh @@ -38,25 +38,13 @@ mkdir -p $outdir for f in phones.txt words.txt L.fst L_disambig.fst phones/; do cp -r $langdir/$f $outdir done -gunzip -c $lm | utils/find_arpa_oovs.pl $outdir/words.txt > $tmpdir/oovs.txt -# grep -v ' ' because the LM seems to have some strange and useless -# stuff in it with multiple 's in the history. Encountered some other similar -# things in a LM from Geoff. Removing all "illegal" combinations of and , -# which are supposed to occur only at being/end of utt. These can cause -# determinization failures of CLG [ends up being epsilon cycles]. -gunzip -c $lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' > $outdir/forward.arpa +gunzip -c $lm > $outdir/forward.arpa echo "Mapping ARPA to reverse ARPA" python utils/reverse_arpa.py $outdir/forward.arpa > $outdir/reverse.arpa -arpa2fst $outdir/reverse.arpa | fstprint | \ - grep -v "230258.5" | \ - utils/remove_oovs.pl $tmpdir/oovs.txt | \ - utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$outdir/words.txt \ - --osymbols=$outdir/words.txt --keep_isymbols=false --keep_osymbols=false \ - | fstrmepsilon > $outdir/G_org.fst +arpa2fst --disambig-symbol=#0 --read-symbol-table=$outdir/words.txt \ + $outdir/reverse.arpa | \ + fstprint | fgrep -v '230258.5' | fstcompile > $outdir/G_org.fst #--arc_type=log echo "Push weights to make it stochastic (log semi-ring)" @@ -84,7 +72,7 @@ if [ -f $lexicon ]; then < "$lexicon" >$tmpdir/g/select_empty.fst.txt fstcompile --isymbols=$outdir/words.txt --osymbols=$outdir/words.txt $tmpdir/g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $outdir/G.fst > $tmpdir/g/empty_words.fst - fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo $tmpdir/g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r $tmpdir/g fi diff --git a/egs/wsj/s5/utils/rnnlm_compute_scores.sh b/egs/wsj/s5/utils/rnnlm_compute_scores.sh index 75c4c262c49..060b645aca3 100755 --- a/egs/wsj/s5/utils/rnnlm_compute_scores.sh +++ b/egs/wsj/s5/utils/rnnlm_compute_scores.sh @@ -62,8 +62,16 @@ cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \ # OK, now we compute the scores on the text with OOVs replaced # with -$rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \ - awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +if [ $rnnlm_ver == "faster-rnnlm" ]; then + $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \ + awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +else + # add the utterance_id as required by Mikolove's rnnlm + paste $tempdir/ids $tempdir/text.nounk > $tempdir/id_text.nounk + + $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/id_text.nounk -nbest -debug 0 | \ + awk '{print $1*log(10);}' > $tempdir/loglikes.rnn +fi [ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat 
$tempdir/loglikes.oov | wc -l` ] && \ echo "rnnlm rescoring failed" && exit 1; diff --git a/egs/wsj/s5/utils/run.pl b/egs/wsj/s5/utils/run.pl index 6145a7ac54f..7df65c086d9 100755 --- a/egs/wsj/s5/utils/run.pl +++ b/egs/wsj/s5/utils/run.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl use warnings; #sed replacement for -w perl parameter -# In general, doing +# In general, doing # run.pl some.log a b c is like running the command a b c in # the bash shell, and putting the standard error and output into some.log. # To run parallel jobs (backgrounded on the host machine), you can do (e.g.) @@ -13,7 +13,7 @@ # run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz # and run.pl will run something like: # ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log -# +# # Basically it takes the command-line arguments, quotes them # as necessary to preserve spaces, and evaluates them with bash. # In addition it puts the command line at the top of the log, and @@ -35,10 +35,12 @@ # options that would normally be given to # queue.pl, which we will just discard. -if (@ARGV > 0) { - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options - # that would normally go to qsub, but which will be ignored here. - $switch = shift @ARGV; +for (my $x = 1; $x <= 2; $x++) { # This for-loop is to + # allow the JOB=1:n option to be interleaved with the + # options to qsub. + while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { + # parse any options that would normally go to qsub, but which will be ignored here. + my $switch = shift @ARGV; if ($switch eq "-V") { $ignored_opts .= "-V "; } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { @@ -48,19 +50,26 @@ die "run.pl: invalid option --max-jobs-run $max_jobs_run"; } } else { - $option = shift @ARGV; - if ($switch eq "-sync" && $option =~ m/^[yY]/) { + my $argument = shift @ARGV; + if ($argument =~ m/^--/) { + print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; + } + if ($switch eq "-sync" && $argument =~ m/^[yY]/) { $ignored_opts .= "-sync "; # Note: in the # corresponding code in queue.pl it says instead, just "$sync = 1;". - } - $ignored_opts .= "$switch $option "; - if ($switch eq "-pe") { # e.g. -pe smp 5 - $option2 = shift @ARGV; - $ignored_opts .= "$option2 "; + } elsif ($switch eq "-pe") { # e.g. -pe smp 5 + my $argument2 = shift @ARGV; + $ignored_opts .= "$switch $argument $argument2 "; + } elsif ($switch =~ m/^--/) { # Config options + # Convert CLI new-style options + # Ignore all options + $ignored_opts .= "$switch $argument "; + } else { # Other qsub options - passed as is + $ignored_opts .= "$switch $argument "; } } } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 + if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 $jobname = $1; $jobstart = $2; $jobend = $3; @@ -83,7 +92,7 @@ # Users found this message confusing so we are removing it. # if ($ignored_opts ne "") { -# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; +# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; # } if ($max_jobs_run == -1) { # If --max-jobs-run option not set, @@ -136,10 +145,10 @@ $cmd = ""; -foreach $x (@ARGV) { +foreach $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . 
" "; } elsif ($x =~ m:\":) { $cmd .= "'$x' "; } - else { $cmd .= "\"$x\" "; } + else { $cmd .= "\"$x\" "; } } #$Data::Dumper::Indent=0; @@ -150,7 +159,7 @@ use POSIX ":sys_wait_h"; for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { if (scalar(keys %active_pids) >= $max_jobs_run) { - + # Lets wait for a change in any child's status # Then we have to work out which child finished $r = waitpid(-1, 0); @@ -158,7 +167,7 @@ if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. if ( defined $active_pids{$r} ) { $jid=$active_pids{$r}; - $fail[$jid]=$code; + $fail[$jid]=$code; if ($code !=0) { $numfail++;} delete $active_pids{$r}; # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; @@ -166,7 +175,7 @@ die "run.pl: Cannot find the PID of the chold process that just finished."; } - # In theory we could do a non-blocking waitpid over all jobs running just + # In theory we could do a non-blocking waitpid over all jobs running just # to find out if only one or more jobs finished during the previous waitpid() # However, we just omit this and will reap the next one in the next pass # through the for(;;) cycle @@ -175,7 +184,7 @@ if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } if ($childpid == 0) { # We're in the child... this branch # executes the job and returns (possibly with an error status). - if (defined $jobname) { + if (defined $jobname) { $cmd =~ s/$jobname/$jobid/g; $logfile =~ s/$jobname/$jobid/g; } @@ -188,7 +197,7 @@ close(F); # Pipe into bash.. make sure we're not using any other shell. - open(B, "|bash") || die "run.pl: Error opening shell command"; + open(B, "|bash") || die "run.pl: Error opening shell command"; print B "( " . $cmd . ") 2>>$logfile >> $logfile"; close(B); # If there was an error, exit status is in $? $ret = $?; @@ -231,9 +240,9 @@ $job_return = $fail[$jobid]; if (not defined $job_return ) { # print Dumper(\@fail); - - die "run.pl: Sanity check failed: we have indication that some jobs are running " . - "even after we waited for all jobs to finish" ; + + die "run.pl: Sanity check failed: we have indication that some jobs are running " . + "even after we waited for all jobs to finish" ; } if ($job_return != 0 ){ $failed_jids++;} } @@ -244,7 +253,7 @@ if ($ret != 0) { $njobs = $jobend - $jobstart + 1; - if ($njobs == 1) { + if ($njobs == 1) { if (defined $jobname) { $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with # that job. diff --git a/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl b/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl index 710da8a4b4c..6db8ea7455f 100755 --- a/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl +++ b/egs/wsj/s5/utils/scoring/wer_per_spk_details.pl @@ -96,6 +96,8 @@ sub format_sys { my @F=split; die "Incompatible format of the utt2spk file: $_" if @F != 2; $UTTMAP{$F[0]} = $F[1]; + # Set width of speaker column by its longest label, + if($SPK_WIDTH < length($F[1])) { $SPK_WIDTH = length($F[1]) } } close(UTT2SPK); diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). 
# This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . 
" "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. 
Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } diff --git a/egs/wsj/s5/utils/split_data.sh b/egs/wsj/s5/utils/split_data.sh index 941890cdd57..c6b501e2b0c 100755 --- a/egs/wsj/s5/utils/split_data.sh +++ b/egs/wsj/s5/utils/split_data.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2010-2013 Microsoft Corporation +# Copyright 2010-2013 Microsoft Corporation # Johns Hopkins University (Author: Daniel Povey) # Licensed under the Apache License, Version 2.0 (the "License"); @@ -56,9 +56,9 @@ if [ -f $data/text ] && [ $nu -ne $nt ]; then fi s1=$data/split$numsplit/1 -if [ ! -d $s1 ]; then +if [ ! -d $s1 ]; then need_to_split=true -else +else need_to_split=false for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ vad.scp segments reco2file_and_channel utt2lang; do @@ -71,11 +71,17 @@ fi if ! $need_to_split; then exit 0; fi - -for n in `seq $numsplit`; do - mkdir -p $data/split$numsplit/$n - utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" -done + +utt2spks=$(for n in `seq $numsplit`; do echo $data/split$numsplit/$n/utt2spk; done) + +directories=$(for n in `seq $numsplit`; do echo $data/split$numsplit/$n; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split$numsplit/$n + done +fi if $split_per_spk; then utt2spk_opt="--utt2spk=$data/utt2spk" @@ -84,7 +90,8 @@ else fi # If lockfile is not installed, just don't lock it. It's not a big deal. -which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock +which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock +trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 @@ -115,21 +122,24 @@ for f in spk2gender spk2warp cmvn.scp; do fi done -for n in `seq $numsplit`; do - dsn=$data/split$numsplit/$n - if [ -f $data/segments ]; then - utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments - awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids. - if [ -f $data/reco2file_and_channel ]; then - utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel - fi - if [ -f $data/wav.scp ]; then - utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp - fi - rm $data/tmp.reco - fi # else it would have been handled above, see maybe_wav. 
-done - -rm -f $data/.split_lock +if [ -f $data/segments ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/utt2spk $data/segments $data/split$numsplit/JOB/segments || exit 1 + for n in `seq $numsplit`; do + dsn=$data/split$numsplit/$n + awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids. + done + if [ -f $data/reco2file_and_channel ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/tmp.reco $data/reco2file_and_channel \ + $data/split$numsplit/JOB/reco2file_and_channel || exit 1 + fi + if [ -f $data/wav.scp ]; then + utils/filter_scps.pl JOB=1:$numsplit \ + $data/split$numsplit/JOB/tmp.reco $data/wav.scp \ + $data/split$numsplit/JOB/wav.scp || exit 1 + fi + for f in $data/split$numsplit/*/tmp.reco; do rm $f; done +fi exit 0 diff --git a/egs/wsj/s5/utils/split_scp.pl b/egs/wsj/s5/utils/split_scp.pl index 70bc8033c9d..be2767ccb8d 100755 --- a/egs/wsj/s5/utils/split_scp.pl +++ b/egs/wsj/s5/utils/split_scp.pl @@ -72,7 +72,7 @@ @OUTPUTS = @ARGV; } else { for ($j = 0; $j < $num_jobs; $j++) { - if ($j == $job_id) { + if ($j == $job_id) { if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } else { push @OUTPUTS, "-"; } } else { @@ -98,12 +98,12 @@ $s = $utt2spk{$u}; if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } if(!defined $spk_count{$s}) { - push @spkrs, $s; + push @spkrs, $s; $spk_count{$s} = 0; - $spk_data{$s} = ""; + $spk_data{$s} = []; # ref to new empty array. } $spk_count{$s}++; - $spk_data{$s} = $spk_data{$s} . $_; + push @{$spk_data{$s}}, $_; } # Now split as equally as possible .. # First allocate spks to files by allocating an approximately @@ -182,7 +182,7 @@ $error = 1; } else { foreach $spk ( @{$scparray[$scpidx]} ) { - print F $spk_data{$spk}; + print F @{$spk_data{$spk}}; $count += $spk_count{$spk}; } if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } @@ -190,7 +190,7 @@ close(F); } } else { - # This block is the "normal" case where there is no --utt2spk + # This block is the "normal" case where there is no --utt2spk # option and we just break into equal size chunks. open(I, "<$inscp") || die "Opening input scp file $inscp"; diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh index be74ac8c177..154b9c81c0a 100755 --- a/egs/wsj/s5/utils/subset_data_dir.sh +++ b/egs/wsj/s5/utils/subset_data_dir.sh @@ -106,6 +106,7 @@ function do_filtering { [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang + [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp @@ -159,7 +160,7 @@ elif $perspk; then do_filtering; # bash function. exit 0; else - if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then + if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then echo "subset_data_dir.sh: cannot subset to more utterances than you originally had." 
exit 1; fi diff --git a/egs/wsj/s5/utils/subset_scp.pl b/egs/wsj/s5/utils/subset_scp.pl index a8bcdfc1fc3..11fddc09a0f 100755 --- a/egs/wsj/s5/utils/subset_scp.pl +++ b/egs/wsj/s5/utils/subset_scp.pl @@ -71,23 +71,27 @@ } sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if($num_needed > $diff) { die "select_n: code error"; } - if($diff == 1 ) { - if($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); + my ($start,$end,$num_needed) = @_; + my $diff = $end - $start; + if ($num_needed > $diff) { + die "select_n: code error"; + } + if ($diff == 1 ) { + if ($num_needed > 0) { + print $F[$start]; } + } else { + my $halfdiff = int($diff/2); + my $halfneeded = int($num_needed/2); + select_n($start, $start+$halfdiff, $halfneeded); + select_n($start+$halfdiff, $end, $num_needed - $halfneeded); + } } if ( ! $first && ! $last) { - select_n(0, $numlines, $N); + if ($N > 0) { + select_n(0, $numlines, $N); + } } else { if ($first) { # --first option: same as head. for ($n = 0; $n < $N; $n++) { diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index da962177bef..19452c3c235 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -133,7 +133,7 @@ if [ -f $data/wav.scp ]; then ! cat $data/segments | \ awk '{if (NF != 4 || ($4 <= $3 && $4 != -1)) { print "Bad line in segments file", $0; exit(1); }}' && \ echo "$0: badly formatted segments file" && exit 1; - + segments_len=`cat $data/segments | wc -l` if [ -f $data/text ]; then ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/text) && \ @@ -153,14 +153,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -188,14 +188,14 @@ if [ -f $data/wav.scp ]; then # this file is needed only for ctm scoring; it's indexed by recording-id. check_sorted_and_uniq $data/reco2file_and_channel ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { if ( NF == 3 && $3 == "1" ) { warning_issued = 1; } else { - print "Bad line ", $0; exit 1; + print "Bad line ", $0; exit 1; } } - } + } END { if (warning_issued == 1) { print "The channel should be marked as A or B, not 1! You should change it ASAP! " @@ -228,6 +228,7 @@ if [ -f $data/feats.scp ]; then fi fi + if [ -f $data/cmvn.scp ]; then check_sorted_and_uniq $data/cmvn.scp cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn @@ -283,7 +284,7 @@ if [ -f $data/utt2warp ]; then fi # check some optionally-required things -for f in vad.scp utt2lang; do +for f in vad.scp utt2lang utt2uniq; do if [ -f $data/$f ]; then check_sorted_and_uniq $data/$f if ! 
cmp -s <( awk '{print $1}' $data/utt2spk ) \ @@ -294,4 +295,19 @@ for f in vad.scp utt2lang; do fi done + +if [ -f $data/utt2dur ]; then + check_sorted_and_uniq $data/utt2dur + cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur + if ! cmp -s $tmpdir/utts{,.utt2dur}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.feats} + exit 1; + fi + cat $data/utt2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + echo "$0: Successfully validated data-directory $data" diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index ca33f84c8c4..5cc04c1e6ff 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -25,6 +25,7 @@ if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} $idx = 1; %silence = (); +$crlf = 1; print "--> reading $dict/silence_phones.txt\n"; while() { @@ -32,19 +33,24 @@ print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; set_to_fail(); } + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } my @col = split(" ", $_); if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; + set_to_fail(); + print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; } foreach(0 .. @col-1) { my $p = $col[$_]; if($silence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; } else {$silence{$p} = 1;} - if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ + if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){ set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form"; - + print "--> ERROR: phone \"$p\" has disallowed written form\n"; + } } $idx ++; @@ -59,14 +65,20 @@ if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} $idx = 1; $success = 1; +$crlf = 1; print "--> reading $dict/optional_silence.txt\n"; while() { chomp; my @col = split(" ", $_); if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; + set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; + set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; + } + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; } $idx ++; } @@ -81,25 +93,31 @@ $idx = 1; %nonsilence = (); $success = 1; +$crlf = 1; print "--> reading $dict/nonsilence_phones.txt\n"; while() { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (! s/\n$//) { print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; set_to_fail(); } my @col = split(" ", $_); if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; + set_to_fail(); + print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; } foreach(0 .. 
@col-1) { my $p = $col[$_]; if($nonsilence{$p}) {set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; } else {$nonsilence{$p} = 1;} - if ($p =~ m/_$/ || $p =~ m/#/ || $p =~ m/_[BESI]$/){ + if ($p =~ m/#(\d)+/ || $p =~ m/_[BESI]$/){ set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form"; - + print "--> ERROR: phone \"$p\" has disallowed written form\n"; + } } $idx ++; @@ -134,9 +152,14 @@ sub check_lexicon { print "Checking $lex\n"; !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); my %seen_line = {}; - $idx = 1; $success = 1; + $idx = 1; $success = 1; $crlf = 1; print "--> reading $lex\n"; while () { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (defined $seen_line{$_}) { print "--> ERROR: line '$_' of $lex is repeated\n"; set_to_fail(); @@ -157,7 +180,7 @@ sub check_lexicon { } for ($n = 0; $n < $num_prob_cols; $n++) { $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { + if (!($prob > 0.0 && $prob <= 1.0)) { print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; set_to_fail(); } @@ -171,7 +194,7 @@ sub check_lexicon { foreach (0 .. @col-1) { if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; + print "(line $idx)\n"; set_to_fail(); } } @@ -191,16 +214,22 @@ sub check_lexicon { if (-f "$dict/silprob.txt") { !open(SP, "<$dict/silprob.txt") && print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); + $crlf = 1; while () { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } chomp; my @col = split; @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { + if (!($col[1] > 0.0 && $col[1] <= 1.0)) { set_to_fail(); print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; } } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { + if ($col[1] <= 0.0) { set_to_fail(); print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; } @@ -290,8 +319,14 @@ sub check_lexicon_pair { } $idx = 1; $success = 1; + $crlf = 1; print "--> reading $dict/extra_questions.txt\n"; while() { + if ($crlf == 1 && m/\r/) { + print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; + set_to_fail(); + $crlf = 0; + } if (! s/\n$//) { print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; set_to_fail(); @@ -302,7 +337,7 @@ sub check_lexicon_pair { } foreach (0 .. @col-1) { if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; + set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; } $idx ++; } @@ -336,7 +371,7 @@ sub check_lexicon_pair { $num_warn_nosplit_limit = 10; while() { my @col = split(" ", $_); - foreach $p1 (@col) { + foreach $p1 (@col) { foreach $p2 (@col) { if ($p1 ne $p2 && ! 
$distinguished{$p1,$p2}) { set_to_fail(); diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 0d00379f82c..657142689ee 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -8,20 +8,28 @@ $skip_det_check = 0; +$skip_disambig_check = 0; if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") { $skip_det_check = 1; shift @ARGV; } +if (@ARGV > 0 && $ARGV[0] eq "--skip-disambig-check") { + $skip_disambig_check = 1; + shift @ARGV; +} + if (@ARGV != 1) { print "Usage: $0 [options] \n"; print "e.g.: $0 data/lang\n"; print "Options:\n"; print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n"; + print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n"; exit(1); } +print "$0 " . join(" ", @ARGV) . "\n"; $lang = shift @ARGV; $exit = 0; @@ -48,7 +56,7 @@ $idx ++; } close(P); -%pint2sym = (); +%pint2sym = (); foreach (keys %psymtab) { if ($pint2sym{$psymtab{$_}}) { print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1; @@ -81,7 +89,7 @@ $idx ++; } close(W); -%wint2sym = (); +%wint2sym = (); foreach (keys %wsymtab) { if ($wint2sym{$wsymtab{$_}}) { print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1; @@ -89,15 +97,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +113,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -172,7 +171,7 @@ sub check_txt_int_csl { close(CSL); if ($idx1 != 0) { # nonempty .txt,.int files if ($num_lines != 1) { - $exit = 1; + $exit = 1; return print "--> ERROR: expect 1 line in $cat.csl\n"; } } else { @@ -212,7 +211,7 @@ sub check_txt_int { s/ internal$//g; s/ singleton$//g; $entry[$idx1] = $_; - $idx1 ++; + $idx1 ++; } close(TXT); $idx1 --; print "--> $idx1 entry/entries in $cat.txt\n"; @@ -287,7 +286,7 @@ sub check_disjoint { if (!open(N, "<$lang/phones/nonsilence.txt")) { $exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n"; } - if (!open(D, "<$lang/phones/disambig.txt")) { + if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) { $exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n"; } @@ -305,7 +304,7 @@ sub check_disjoint { } close(S); - $idx = 1; + $idx = 1; while () { chomp; my @col = split(" ", $_); @@ -382,7 +381,7 @@ sub check_summation { if (scalar(keys %nonsilence) == 0) { $exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n"; } - if (scalar(keys %disambig) == 0) { + if (!$skip_disambig_check && scalar(keys %disambig) == 0) { $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; } @@ -427,8 +426,11 @@ sub check_summation { check_disjoint; print "\n"; check_summation; print "\n"; -@list1 = ("context_indep", "disambig", "nonsilence", "silence", "optional_silence"); +@list1 = ("context_indep", "nonsilence", "silence", "optional_silence"); 
@list2 = ("roots", "sets"); +if (!$skip_disambig_check) { + push(@list1, "disambig"); +} foreach (@list1) { check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n"; } @@ -439,14 +441,11 @@ sub check_summation { check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n"; } else { print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; - if ((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int")) { - print "--> WARNING: the optional $lang/phones/extra_questions.\{txt, int\} are empty!\n\n"; - $warning = 1; - } else { + if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) { print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; $exit = 1; } -} +} if (-e "$lang/phones/word_boundary.txt") { check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n"; } @@ -476,19 +475,21 @@ sub check_summation { $success == 0 || print "--> $lang/phones/optional_silence.txt is OK\n"; print "\n"; -# Check disambiguation symbols ------------------------------- -print "Checking disambiguation symbols: #0 and #1\n"; -if (scalar(keys %disambig) == 0) { - $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; -} -if (exists $disambig{"#0"} and exists $disambig{"#1"}) { - print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; - print "--> $lang/phones/disambig.txt is OK\n\n"; -} else { - print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; - print "--> this would not be OK with a conventional ARPA-type language\n"; - print "--> model or a conventional lexicon (L.fst)\n"; - $warning = 1; +if (!$skip_disambig_check) { + # Check disambiguation symbols ------------------------------- + print "Checking disambiguation symbols: #0 and #1\n"; + if (scalar(keys %disambig) == 0) { + $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; + } + if (exists $disambig{"#0"} and exists $disambig{"#1"}) { + print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; + print "--> $lang/phones/disambig.txt is OK\n\n"; + } else { + print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; + print "--> this would not be OK with a conventional ARPA-type language\n"; + print "--> model or a conventional lexicon (L.fst)\n"; + $warning = 1; + } } @@ -500,48 +501,46 @@ sub check_summation { if (!open(T, "<$lang/topo")) { $exit = 1; print "--> ERROR: fail to open $lang/topo\n"; } else { + $topo_ok = 1; $idx = 1; + %phones_in_topo_int_hash = ( ); + %phones_in_topo_hash = ( ); while () { chomp; next if (m/^<.*>[ ]*$/); - if ($idx == 1) { - $nonsilence_seq = $_; $idx ++; - } - if ($idx == 2) { - $silence_seq = $_; + foreach $i (split(" ", $_)) { + if (defined $phones_in_topo_int_hash{$i}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n"; + } + if (!defined $pint2sym{$i}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n"; + } + $phones_in_topo_int_hash{$i} = 1; + $phones_in_topo_hash{$pint2sym{$i}} = 1; } } close(T); - if ($silence_seq == 0 || $nonsilence_seq == 0) { - $exit = 1; print "--> ERROR: $lang/topo doesn't have nonsilence section or silence section\n"; - } - @silence_seq = split(" ", $silence_seq); - @nonsilence_seq = split(" ", $nonsilence_seq); - $success1 = 1; - if (@nonsilence_seq != @nonsilence) { - $exit = 1; print "--> ERROR: $lang/topo's nonsilence 
section doesn't correspond to nonsilence.txt\n"; - } else { - foreach (0 .. scalar(@nonsilence)-1) { - if ($psymtab{@nonsilence[$_]} ne @nonsilence_seq[$_]) { - $exit = 1; print "--> ERROR: $lang/topo's nonsilence section doesn't correspond to nonsilence.txt\n"; - $success = 0; - } + $phones_that_should_be_in_topo_hash = {}; + foreach $p (@silence, @nonsilence) { $phones_that_should_be_in_topo_hash{$p} = 1; } + foreach $p (keys %phones_that_should_be_in_topo_hash) { + if ( ! defined $phones_in_topo_hash{$p}) { + $topo_ok = 0; + $i = $pint2sym{$p}; + $exit = 1; print "--> ERROR: $lang/topo does not cover phone $p (label = $i)\n"; } } - $success1 != 1 || print "--> $lang/topo's nonsilence section is OK\n"; - $success2 = 1; - if (@silence_seq != @silence) { - $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n"; - } else { - foreach (0 .. scalar(@silence)-1) { - if ($psymtab{@silence[$_]} ne @silence_seq[$_]) { - $exit = 1; print "--> ERROR: $lang/topo's silence section doesn't correspond to silence.txt\n"; - $success = 0; - } + foreach $i (keys %phones_in_topo_int_hash) { + $p = $pint2sym{$i}; + if ( ! defined $phones_that_should_be_in_topo_hash{$p}) { + $topo_ok = 0; + $exit = 1; print "--> ERROR: $lang/topo covers phone $p (label = $i) which is not a real phone\n"; } } - $success2 != 1 || print "--> $lang/topo's silence section is OK\n"; - $success1 != 1 or $success2 != 1 || print "--> $lang/topo is OK\n"; + if ($topo_ok) { + "--> $lang/topo is OK\n"; + } print "\n"; } @@ -606,7 +605,7 @@ sub check_summation { foreach (keys %sum) { if (!$itset{$_}) { print "$_ "; - } + } } print "\n"; } @@ -625,6 +624,80 @@ sub check_summation { print "\n"; } + + +{ + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). + + # note, %wdisambig_words_hash hashes from the integer word-id of word-level + # disambiguation symbols, to 1 if the word is a disambig symbol. + my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + + if (-s "$lang/phones/word_boundary.int") { print "Checking word_boundary.int and disambig.int\n"; if (!open (W, "<$lang/phones/word_boundary.int")) { @@ -641,7 +714,7 @@ sub check_summation { if (!open (D, "<$lang/phones/disambig.int")) { $exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n"; } - while () { + while () { @A = split; if (@A != 1) { $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; @@ -657,7 +730,9 @@ sub check_summation { $wordseq_syms = ""; foreach (1 .. $wlen) { $id = int(rand(scalar(keys %wint2sym))); - while ($wint2sym{$id} =~ m/^#[0-9]*$/ or + # exclude disambiguation symbols, BOS and EOS and epsilon from the word + # sequence. + while (defined $wdisambig_words_hash{$wint2sym{$id}} or $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or $id == 0) { $id = int(rand(scalar(keys %wint2sym))); } @@ -781,21 +856,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . 
- "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/egs/yesno/s5/input/task.arpabo b/egs/yesno/s5/input/task.arpabo index 415391c98bd..5c6b525b9d7 100644 --- a/egs/yesno/s5/input/task.arpabo +++ b/egs/yesno/s5/input/task.arpabo @@ -1,6 +1,6 @@ \data\ -ngram 1=3 +ngram 1=4 \1-grams: -1 NO diff --git a/egs/yesno/s5/local/prepare_lm.sh b/egs/yesno/s5/local/prepare_lm.sh index de5884d3a86..a5f5431efd3 100755 --- a/egs/yesno/s5/local/prepare_lm.sh +++ b/egs/yesno/s5/local/prepare_lm.sh @@ -1,7 +1,7 @@ #!/bin/bash . path.sh - + echo Preparing language models for test for lm_suffix in tg; do @@ -10,10 +10,10 @@ for lm_suffix in tg; do rm -rf data/lang_test_${lm_suffix} cp -r data/lang data/lang_test_${lm_suffix} - cat input/task.arpabo | arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst - #cat input/G.txt | utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst + arpa2fst --disambig-symbol=#0 --read-symbol-table=$test/words.txt input/task.arpabo $test/G.fst + fstisstochastic $test/G.fst - + # The output is like: # 9.14233e-05 -0.259833 # we do expect the first of these 2 numbers to be close to zero (the second is @@ -30,7 +30,7 @@ for lm_suffix in tg; do < data/local/dict/lexicon.txt >tmpdir.g/select_empty.fst.txt fstcompile --isymbols=$test/words.txt --osymbols=$test/words.txt tmpdir.g/select_empty.fst.txt | \ fstarcsort --sort_type=olabel | fstcompose - $test/G.fst > tmpdir.g/empty_words.fst - fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && + fstinfo tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' && echo "Language model has cycles with empty words" && exit 1 rm -r tmpdir.g done diff --git a/egs/yesno/s5/path.sh b/egs/yesno/s5/path.sh index 708524a5587..21bfd1440fa 100644 --- a/egs/yesno/s5/path.sh +++ b/egs/yesno/s5/path.sh @@ -1,3 +1,8 @@ - -export PATH=$PWD/utils/:$PWD/../../../src/bin:$PWD/../../../tools/openfst/bin:$PWD/../../../src/fstbin/:$PWD/../../../src/gmmbin/:$PWD/../../../src/featbin/:$PWD/../../../src/lm/:$PWD/../../../src/sgmmbin/:$PWD/../../../src/fgmmbin/:$PWD/../../../src/latbin/:$PWD:$PATH +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C + + diff --git a/egs/yesno/s5/run.sh b/egs/yesno/s5/run.sh index 3e5d59a9656..12b00273f8b 100755 --- a/egs/yesno/s5/run.sh +++ b/egs/yesno/s5/run.sh @@ -26,6 +26,7 @@ local/prepare_lm.sh for x in train_yesno test_yesno; do steps/make_mfcc.sh --nj 1 data/$x exp/make_mfcc/$x mfcc steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc + utils/fix_data_dir.sh data/$x done # Mono training diff --git a/misc/maintenance/find_missing_dependencies.sh b/misc/maintenance/find_missing_dependencies.sh index 3854dd5ceaa..55e300fe9f0 100755 --- a/misc/maintenance/find_missing_dependencies.sh +++ b/misc/maintenance/find_missing_dependencies.sh @@ -1,5 +1,7 @@ #!/bin/bash +echo "$0: finding missing inter-directory dependencies in src/Makefile" + cd src for x in */Makefile; do @@ -9,4 +11,4 @@ for x in */Makefile; do echo "$dir: $dependency"; fi done -done \ No newline at end of file +done diff --git a/misc/maintenance/fix_apache_headers.sh b/misc/maintenance/fix_apache_headers.sh index 8653bdf6457..7fb813b2624 100755 --- a/misc/maintenance/fix_apache_headers.sh +++ b/misc/maintenance/fix_apache_headers.sh @@ -4,6 +4,11 @@ # authors appears in the apache headers in the source, and that source files # have their Apache headers. Including this mainly for documentation, as I # doubt the issue will occur much in future. +# +# Also makes sure that where the filename appears in a comment at the top of the +# file, e.g. as in +# // somedir/some-file.cc +# the filename is accurate. # run this from the top level of the repo, as # misc/maintenance/fix_apache_headers.sh @@ -11,17 +16,31 @@ set -e cd src rm -rf tmp -for x in */*.{h,cc,dox}; do +for x in */*.{h,cc,dox}; do if [ $x != "util/basic-filebuf.h" ]; then if ! grep 'COPYING for clarification' $x >/dev/null; then echo Fixing $x; if ! grep "Apache License" $x >/dev/null; then echo "$0: warning: file $x may not have an Apache license header" else - cp $x tmp; cat tmp | perl -ape ' if (m/Licensed under the Apache License/) { - print "// See ../../COPYING for clarification regarding multiple authors\n"; + cp $x tmp; cat tmp | perl -ape ' if (m/Licensed under the Apache License/) { + print "// See ../../COPYING for clarification regarding multiple authors\n"; print "//\n";} ' > $x; fi fi fi done + +for x in */*.{h,cc,dox}; do + if [ $x != "util/basic-filebuf.h" ]; then + echo "// $x" | cat - <(tail -n +2 $x) >tmp + if ! diff tmp $x; then + if head -n 1 $x | grep -E '// [-a-z0-9_]+/[-a-z0-9_.]+$'; then + echo "Fixing $x automatically" + cp tmp $x + else + echo "**Please fix $x manually" + fi + fi + fi +done diff --git a/misc/maintenance/fix_include_guards.sh b/misc/maintenance/fix_include_guards.sh index b1338371a78..dde5e6cf155 100755 --- a/misc/maintenance/fix_include_guards.sh +++ b/misc/maintenance/fix_include_guards.sh @@ -8,13 +8,13 @@ set -e cd src rm -rf tmp -for x in */*.h ; do +for x in */*.h ; do name=`echo $x | tr '[a-z]/.-' '[A-Z]___' ` - m=KALDI_${name}_ + m=KALDI_${name}_ n=`grep ifndef $x | awk '{print $2}' | head -n 1` - if [ "$m" != "$n" ]; then - echo "$m != $n"; - if [ ! -z "$n" ]; then + if [ "$m" != "$n" ]; then + echo "$m != $n"; + if [ ! -z "$n" ]; then cp $x tmp; sed s/$n/$m/ $x; else echo "Something wrong for file $x, maybe no include guard." @@ -23,3 +23,12 @@ for x in */*.h ; do done +for x in */*.h ; do + name=`echo $x | tr '[a-z]/.-' '[A-Z]___' ` + m=KALDI_${name}_ + n=`grep endif $x | grep _H_ | sed s://:: | awk '{print $2}' | head -n 1` + if [ ! 
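
The second loop added to fix_apache_headers.sh checks that the "// somedir/some-file.cc" comment on the first line of each source file names the file correctly, and only rewrites it automatically when the existing first line already looks like a path comment. The same test expressed as a minimal C++ sketch (the path used in main() is purely illustrative; the real tool is the shell loop above):

    #include <fstream>
    #include <iostream>
    #include <string>

    // Returns true if the first line of 'path' is exactly "// <path>".
    bool FirstLineNamesFile(const std::string &path) {
      std::ifstream in(path.c_str());
      std::string first_line;
      if (!std::getline(in, first_line)) return false;
      return first_line == "// " + path;
    }

    int main() {
      // Illustrative path, relative to src/ as in the script.
      std::string path = "base/io-funcs.h";
      if (!FirstLineNamesFile(path))
        std::cout << "**Please fix " << path << " manually\n";
      return 0;
    }
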
-s $n ] && [ "$m" != "$n" ]; then + echo "#endif: $m != $n"; + cp $x tmp; sed s/$n/$m/ $x; + fi +done diff --git a/src/Makefile b/src/Makefile index 260879c788b..c10bb518e9d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,11 +8,11 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat kws cudamatrix nnet \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \ + ivector ivectorbin online2 online2bin lmbin chainbin MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \ - fstext hmm lm decoder lat nnet \ + fstext hmm lm decoder lat nnet kws chain \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \ ivector ivectorbin online2 online2bin lmbin @@ -111,7 +111,7 @@ ext_test: $(addsuffix /test, $(EXT_SUBDIRS)) # Define an implicit rule, expands to e.g.: # base/test: base -# $(MAKE) -C base test +# $(MAKE) -C base test %/test: % mklibdir $(MAKE) -C $< test @@ -134,7 +134,7 @@ ext_depend: check_portaudio .PHONY: $(SUBDIRS) $(SUBDIRS) : mklibdir - $(MAKE) -C $@ + $(MAKE) -C $@ .PHONY: $(EXT_SUBDIRS) $(EXT_SUBDIRS) : mklibdir @@ -145,37 +145,37 @@ $(EXT_SUBDIRS) : mklibdir # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin latbin ivectorbin lmbin: \ +bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 #2)The libraries have inter-dependencies base: matrix : base -util: base matrix -thread: util matrix base +thread : base +util: base matrix thread feat: base matrix util gmm transform tree thread -tree: base util matrix +tree: base util thread matrix optimization: base matrix gmm: base util matrix tree thread transform: base util matrix gmm tree thread sgmm: base util matrix gmm tree transform thread hmm sgmm2: base util matrix gmm tree transform thread hmm -fstext: base util matrix tree -hmm: base tree matrix util -lm: base util fstext -decoder: base util matrix gmm sgmm hmm tree transform lat -lat: base util hmm tree matrix -cudamatrix: base util matrix -nnet: base util matrix cudamatrix +fstext: base util thread matrix tree +hmm: base tree matrix util thread +lm: base util thread matrix fstext +decoder: base util thread matrix gmm sgmm hmm tree transform lat +lat: base util thread hmm tree matrix +cudamatrix: base util thread matrix +nnet: base util thread matrix cudamatrix nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix -nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix -ivector: base util matrix thread transform tree gmm +nnet3: base util matrix thread lat gmm hmm tree transform cudamatrix chain fstext +chain: lat hmm tree fstext matrix cudamatrix util thread base +ivector: base util matrix thread transform tree gmm #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread -online2bin: base matrix util feat tree 
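
fix_include_guards.sh derives the expected guard macro from the header path with tr '[a-z]/.-' '[A-Z]___' (letters uppercased; '/', '.' and '-' become '_') and wraps it as KALDI_<NAME>_; the new second loop applies the same rule to the #endif comment at the bottom of the file. A C++ rendering of that naming rule, for reference:

    #include <cctype>
    #include <iostream>
    #include <string>

    // "base/io-funcs-inl.h" -> "KALDI_BASE_IO_FUNCS_INL_H_"
    std::string ExpectedGuard(const std::string &path) {
      std::string name = path;
      for (size_t i = 0; i < name.size(); i++) {
        char c = name[i];
        if (c == '/' || c == '.' || c == '-')
          name[i] = '_';
        else
          name[i] = std::toupper(static_cast<unsigned char>(c));
      }
      return "KALDI_" + name + "_";
    }

    int main() {
      std::cout << ExpectedGuard("base/io-funcs-inl.h") << "\n";
      // Prints: KALDI_BASE_IO_FUNCS_INL_H_
      return 0;
    }
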
optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online2 thread ivector # python-kaldi-decoding: base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm decoder lat online online: decoder gmm transform feat matrix util base lat hmm thread tree -online2: decoder gmm transform feat matrix util base lat hmm thread ivector cudamatrix nnet2 -kws: base util hmm tree matrix lat -kwsbin: fstext kws lat base util hmm tree matrix +online2: decoder gmm transform feat matrix util base lat hmm thread ivector cudamatrix nnet2 nnet3 chain +kws: base util thread hmm tree matrix lat + diff --git a/src/base/io-funcs-inl.h b/src/base/io-funcs-inl.h index e55458ed43c..6b87f4c1a24 100644 --- a/src/base/io-funcs-inl.h +++ b/src/base/io-funcs-inl.h @@ -1,7 +1,9 @@ // base/io-funcs-inl.h // Copyright 2009-2011 Microsoft Corporation; Saarland University; -// Jan Silovsky; Yanmin Qian; Johns Hopkins University (Author: Daniel Povey) +// Jan Silovsky; Yanmin Qian; +// Johns Hopkins University (Author: Daniel Povey) +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -62,7 +64,6 @@ template inline void ReadBasicType(std::istream &is, char len_c = static_cast(len_c_in), len_c_expected = (std::numeric_limits::is_signed ? 1 : -1) * static_cast(sizeof(*t)); - if (len_c != len_c_expected) { KALDI_ERR << "ReadBasicType: did not get expected integer type, " << static_cast(len_c) @@ -87,6 +88,112 @@ template inline void ReadBasicType(std::istream &is, } } +// Template that covers integers. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v) { + // Compile time assertion that this is not called with a wrong type. + KALDI_ASSERT_IS_INTEGER_TYPE(T); + if (binary) { + char sz = sizeof(T); // this is currently just a check. + os.write(&sz, 1); + int32 vecsz = static_cast(v.size()); + KALDI_ASSERT((size_t)vecsz == v.size()); + os.write(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (vecsz != 0) { + os.write(reinterpret_cast(&(v[0])), sizeof(T) * vecsz * 2); + } + } else { + // focus here is on prettiness of text form rather than + // efficiency of reading-in. + // reading-in is dominated by low-level operations anyway: + // for efficiency use binary. + os << "[ "; + typename std::vector >::const_iterator iter = v.begin(), + end = v.end(); + for (; iter != end; ++iter) { + if (sizeof(T) == 1) + os << static_cast(iter->first) << ',' + << static_cast(iter->second) << ' '; + else + os << iter->first << ',' + << iter->second << ' '; + } + os << "]\n"; + } + if (os.fail()) { + throw std::runtime_error("Write failure in WriteIntegerPairVector."); + } +} + +// Template that covers integers. +template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v) { + KALDI_ASSERT_IS_INTEGER_TYPE(T); + KALDI_ASSERT(v != NULL); + if (binary) { + int sz = is.peek(); + if (sz == sizeof(T)) { + is.get(); + } else { // this is currently just a check. + KALDI_ERR << "ReadIntegerPairVector: expected to see type of size " + << sizeof(T) << ", saw instead " << sz << ", at file position " + << is.tellg(); + } + int32 vecsz; + is.read(reinterpret_cast(&vecsz), sizeof(vecsz)); + if (is.fail() || vecsz < 0) goto bad; + v->resize(vecsz); + if (vecsz > 0) { + is.read(reinterpret_cast(&((*v)[0])), sizeof(T)*vecsz*2); + } + } else { + std::vector > tmp_v; // use temporary so v doesn't use extra memory + // due to resizing. 
+ is >> std::ws; + if (is.peek() != static_cast('[')) { + KALDI_ERR << "ReadIntegerPairVector: expected to see [, saw " + << is.peek() << ", at file position " << is.tellg(); + } + is.get(); // consume the '['. + is >> std::ws; // consume whitespace. + while (is.peek() != static_cast(']')) { + if (sizeof(T) == 1) { // read/write chars as numbers. + int16 next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::make_pair((T)next_t1, (T)next_t2)); + } else { + T next_t1, next_t2; + is >> next_t1; + if (is.fail()) goto bad; + if (is.peek() != static_cast(',')) + KALDI_ERR << "ReadIntegerPairVector: expected to see ',', saw " + << is.peek() << ", at file position " << is.tellg(); + is.get(); // consume the ','. + is >> next_t2 >> std::ws; + if (is.fail()) goto bad; + else + tmp_v.push_back(std::pair(next_t1, next_t2)); + } + } + is.get(); // get the final ']'. + *v = tmp_v; // could use std::swap to use less temporary memory, but this + // uses less permanent memory. + } + if (!is.fail()) return; + bad: + KALDI_ERR << "ReadIntegerPairVector: read failure at file position " + << is.tellg(); +} template inline void WriteIntegerVector(std::ostream &os, bool binary, const std::vector &v) { @@ -117,7 +224,7 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, os << "]\n"; } if (os.fail()) { - throw std::runtime_error("Write failure in WriteIntegerType."); + throw std::runtime_error("Write failure in WriteIntegerVector."); } } @@ -178,6 +285,7 @@ template inline void ReadIntegerVector(std::istream &is, << is.tellg(); } + // Initialize an opened stream for writing by writing an optional binary // header and modifying the floating-point precision. 
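
The new WriteIntegerPairVector/ReadIntegerPairVector templates store a vector of integer pairs either in binary form (a size byte, a 32-bit count, then the raw pairs) or in a text form such as "[ 1,2 3,4 ]". The standalone sketch below mimics only the text layout so the format is easy to see; it is not the Kaldi implementation, which also covers the char-sized and binary cases and Kaldi's own error handling:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    // Write pairs in the same text layout as WriteIntegerPairVector: "[ a,b c,d ]".
    void WritePairsText(std::ostream &os,
                        const std::vector<std::pair<int, int> > &v) {
      os << "[ ";
      for (size_t i = 0; i < v.size(); i++)
        os << v[i].first << ',' << v[i].second << ' ';
      os << "]\n";
    }

    // Read the same layout back; returns false on any format problem.
    bool ReadPairsText(std::istream &is, std::vector<std::pair<int, int> > *v) {
      std::string tok;
      if (!(is >> tok) || tok != "[") return false;
      v->clear();
      while (is >> tok && tok != "]") {
        int a, b;
        char comma;
        std::istringstream pair_ss(tok);
        if (!(pair_ss >> a >> comma >> b) || comma != ',') return false;
        v->push_back(std::make_pair(a, b));
      }
      return tok == "]";
    }

    int main() {
      std::vector<std::pair<int, int> > v, v2;
      v.push_back(std::make_pair(3, 7));
      v.push_back(std::make_pair(-1, 42));
      std::stringstream ss;
      WritePairsText(ss, v);
      ReadPairsText(ss, &v2);
      std::cout << ss.str() << (v == v2 ? "round-trip ok\n" : "mismatch\n");
      return 0;
    }
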
inline void InitKaldiOutputStream(std::ostream &os, bool binary) { diff --git a/src/base/io-funcs-test.cc b/src/base/io-funcs-test.cc index 36a9e1e5f3f..dd05326d5ed 100644 --- a/src/base/io-funcs-test.cc +++ b/src/base/io-funcs-test.cc @@ -43,8 +43,20 @@ void UnitTestIo(bool binary) { WriteIntegerVector(outfile, binary, vec2); if (!binary) outfile << " \n"; std::vector vec3; - for (size_t i = 0; i < 10; i++) vec3.push_back(Rand()%100); + + int32 size = RandInt(0, 10); + for (size_t i = 0; i < size; i++) vec3.push_back(Rand()%100); WriteIntegerVector(outfile, binary, vec3); + std::vector > vec4; + WriteIntegerPairVector(outfile, binary, vec4); + if (!binary && Rand()%2 == 0) outfile << " \n"; + std::vector > vec5; + for (size_t i = 0; i < size; i++) vec5.push_back(std::make_pair(Rand()%100 - 10, Rand()%100 - 10)); + WriteIntegerPairVector(outfile, binary, vec5); + if (!binary) outfile << " \n"; + std::vector > vec6; + for (size_t i = 0; i < size; i++) vec6.push_back(std::make_pair(Rand()%100, Rand()%100)); + WriteIntegerPairVector(outfile, binary, vec6); if (!binary && Rand()%2 == 0) outfile << " \n"; const char *token1 = "Hi"; WriteToken(outfile, binary, token1); @@ -90,9 +102,19 @@ void UnitTestIo(bool binary) { std::vector vec3_in; ReadIntegerVector(infile, binary_in, &vec3_in); KALDI_ASSERT(vec3_in == vec3); + std::vector > vec4_in; + ReadIntegerPairVector(infile, binary_in, &vec4_in); + KALDI_ASSERT(vec4_in == vec4); + std::vector > vec5_in; + ReadIntegerPairVector(infile, binary_in, &vec5_in); + KALDI_ASSERT(vec5_in == vec5); + std::vector > vec6_in; + ReadIntegerPairVector(infile, binary_in, &vec6_in); + KALDI_ASSERT(vec6_in == vec6); std::string token1_in, token2_in; KALDI_ASSERT(Peek(infile, binary_in) == static_cast(*token1)); - KALDI_ASSERT(PeekToken(infile, binary_in) == (int)*token1); // Note: + KALDI_ASSERT(PeekToken(infile, binary_in) == static_cast(*token1)); + // Note: // the stuff with skipping over '<' is tested in ../util/kaldi-io-test.cc, // since we need to make sure it works with pipes. ReadToken(infile, binary_in, &token1_in); @@ -132,7 +154,7 @@ int main() { UnitTestIo(false); UnitTestIo(true); } - KALDI_ASSERT(1); // just wanted to check that KALDI_ASSERT does not fail for 1. + KALDI_ASSERT(1); // just to check that KALDI_ASSERT does not fail for 1. return 0; } diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 2bc9da895d4..4caddc6b5b3 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Saarland University; // Jan Silovsky; Yanmin Qian +// 2016 Xiaohui Zhang // See ../../COPYING for clarification regarding multiple authors // @@ -98,7 +99,6 @@ namespace kaldi { void ReadToken(std::istream &is, bool binary, std::string *str); void PeekToken(std::istream &is, bool binary, std::string *str); - WriteToken writes the token and one space (whether in binary or text mode). Peek returns the first character of the next token, by consuming whitespace @@ -182,6 +182,16 @@ template inline void WriteIntegerVector(std::ostream &os, bool binary, template inline void ReadIntegerVector(std::istream &is, bool binary, std::vector *v); +/// Function for writing STL vectors of pairs of integer types. +template +inline void WriteIntegerPairVector(std::ostream &os, bool binary, + const std::vector > &v); + +/// Function for reading STL vector of pairs of integer types. 
+template +inline void ReadIntegerPairVector(std::istream &is, bool binary, + std::vector > *v); + /// The WriteToken functions are for writing nonempty sequences of non-space /// characters. They are not for general strings. void WriteToken(std::ostream &os, bool binary, const char *token); diff --git a/src/base/kaldi-common.h b/src/base/kaldi-common.h index 33f6f314db4..e0002d91bb7 100644 --- a/src/base/kaldi-common.h +++ b/src/base/kaldi-common.h @@ -28,8 +28,8 @@ #include #include #include -#include -#include +#include +#include #include "base/kaldi-utils.h" #include "base/kaldi-error.h" @@ -38,4 +38,3 @@ #include "base/kaldi-math.h" #endif // KALDI_BASE_KALDI_COMMON_H_ - diff --git a/src/base/kaldi-error-test.cc b/src/base/kaldi-error-test.cc index 20301e2702f..527de852cac 100644 --- a/src/base/kaldi-error-test.cc +++ b/src/base/kaldi-error-test.cc @@ -46,7 +46,8 @@ int main() { try { kaldi::UnitTestError(); KALDI_ASSERT(0); // should not happen. - } catch (std::runtime_error &r) { + exit(1); + } catch(std::runtime_error &r) { std::cout << "UnitTestError: the error we generated was: " << r.what(); } } diff --git a/src/base/kaldi-error.cc b/src/base/kaldi-error.cc index 96349e17742..5ca884e996f 100644 --- a/src/base/kaldi-error.cc +++ b/src/base/kaldi-error.cc @@ -1,5 +1,6 @@ // base/kaldi-error.cc +// Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Lukas Burget; Ondrej Glembek // See ../../COPYING for clarification regarding multiple authors @@ -20,8 +21,8 @@ #ifdef HAVE_EXECINFO_H #include // To get stack trace in error messages. // If this #include fails there is an error in the Makefile, it does not -// support your platform well. Make sure HAVE_EXECINFO_H is undefined, and the -// code will compile. +// support your platform well. Make sure HAVE_EXECINFO_H is undefined, +// and the code will compile. #ifdef HAVE_CXXABI_H #include // For name demangling. // Useful to decode the stack trace, but only used if we have execinfo.h @@ -32,24 +33,31 @@ #include "base/kaldi-error.h" namespace kaldi { -int32 g_kaldi_verbose_level = 0; // Just initialize this global variable. + +/***** GLOBAL VARIABLES FOR LOGGING *****/ + +int32 g_kaldi_verbose_level = 0; const char *g_program_name = NULL; +static LogHandler g_log_handler = NULL; // If the program name was set (g_program_name != ""), the function // GetProgramName returns the program name (without the path) followed by a // colon, e.g. "gmm-align:". Otherwise it returns the empty string "". const char *GetProgramName() { - if (g_program_name == NULL) return ""; - else return g_program_name; + return g_program_name == NULL ? "" : g_program_name; } + +/***** HELPER FUNCTIONS *****/ + // Given a filename like "/a/b/c/d/e/f.cc", GetShortFileName // returns "e/f.cc". Does not currently work if backslash is // the filename separator. 
-const char *GetShortFileName(const char *filename) { +static const char *GetShortFileName(const char *filename) { const char *last_slash = strrchr(filename, '/'); - if (!last_slash) { return filename; } - else { + if (!last_slash) { + return filename; + } else { while (last_slash > filename && last_slash[-1] != '/') last_slash--; return last_slash; @@ -57,133 +65,180 @@ const char *GetShortFileName(const char *filename) { } +/***** STACKTRACE *****/ + +static std::string Demangle(std::string trace_name) { #if defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) -// The function name looks like a macro: it's a macro if we don't have ccxxabi.h -inline void KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(std::string &ans, - const char *to_append) { - // at input the string "to_append" looks like: + // at input the string looks like: // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] // We want to extract the name e.g. '_ZN5kaldi13UnitTestErrorEv", // demangle it and return it. - int32 status; - const char *paren = strchr(to_append, '('); - const char *plus = (paren ? strchr(paren, '+') : NULL); - if (!plus) { // did not find the '(' or did not find the '+' - // This is a soft failure in case we did not get what we expected. - ans += to_append; - return; + + // try to locate '(' and '+', take the string in between, + size_t begin(trace_name.find("(")), + end(trace_name.rfind("+")); + if (begin != std::string::npos && end != std::string::npos && begin < end) { + trace_name = trace_name.substr(begin+1,end-(begin+1)); } - std::string stripped(paren+1, plus-(paren+1)); // the bit between ( and +. - - char *demangled_name = abi::__cxa_demangle(stripped.c_str(), 0, 0, &status); - - // if status != 0 it is an error (demangling failure), but not all names seem - // to demangle, so we don't check it. - - if (demangled_name != NULL) { - ans += demangled_name; + // demangle, + int status; + char *demangled_name = abi::__cxa_demangle(trace_name.c_str(), 0, 0, &status); + std::string ans; + if (status == 0) { + ans = demangled_name; free(demangled_name); } else { - ans += to_append; // add the original string. + ans = trace_name; } + // return, + return ans; +#else + return trace_name; +#endif } -#else // defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) -#define KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, to_append) ans += to_append -#endif // defined(HAVE_CXXABI_H) && defined(HAVE_EXECINFO_H) + +static std::string KaldiGetStackTrace() { + std::string ans; #ifdef HAVE_EXECINFO_H -std::string KaldiGetStackTrace() { #define KALDI_MAX_TRACE_SIZE 50 #define KALDI_MAX_TRACE_PRINT 20 // must be even. - std::string ans; - void *array[KALDI_MAX_TRACE_SIZE]; - size_t size = backtrace(array, KALDI_MAX_TRACE_SIZE); - char **strings = backtrace_symbols(array, size); + // buffer for the trace, + void *trace[KALDI_MAX_TRACE_SIZE]; + // get the trace, + size_t size = backtrace(trace, KALDI_MAX_TRACE_SIZE); + // get the trace symbols, + char **trace_symbol = backtrace_symbols(trace, size); + + // Compose the 'string', + ans += "[ Stack-Trace: ]\n"; if (size <= KALDI_MAX_TRACE_PRINT) { for (size_t i = 0; i < size; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } } else { // print out first+last (e.g.) 5. 
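
The rewritten Demangle() takes one line from backtrace_symbols(), cuts out the mangled name between '(' and '+', passes it to abi::__cxa_demangle(), and falls back to the original string on any failure. A standalone sketch of that flow is below; it assumes Linux with glibc and libstdc++, and needs -rdynamic at link time for symbol names to appear in the trace:

    #include <cxxabi.h>
    #include <execinfo.h>
    #include <cstdlib>
    #include <iostream>
    #include <string>

    static std::string DemangleFrame(std::string frame) {
      // frame looks like: ./prog(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d]
      size_t begin = frame.find('('), end = frame.rfind('+');
      if (begin == std::string::npos || end == std::string::npos || begin >= end)
        return frame;  // not the expected layout; keep it as-is.
      std::string mangled = frame.substr(begin + 1, end - (begin + 1));
      int status = 0;
      char *demangled = abi::__cxa_demangle(mangled.c_str(), 0, 0, &status);
      if (status != 0 || demangled == NULL) return frame;
      std::string result = demangled;
      free(demangled);
      return result;
    }

    int main() {
      void *trace[20];
      int size = backtrace(trace, 20);
      char **symbols = backtrace_symbols(trace, size);
      if (symbols == NULL) return 1;
      for (int i = 0; i < size; i++)
        std::cout << DemangleFrame(symbols[i]) << "\n";
      free(symbols);  // one malloc'ed block holds the pointers and the strings.
      return 0;
    }
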
for (size_t i = 0; i < KALDI_MAX_TRACE_PRINT/2; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } ans += ".\n.\n.\n"; for (size_t i = size - KALDI_MAX_TRACE_PRINT/2; i < size; i++) { - KALDI_APPEND_POSSIBLY_DEMANGLED_STRING(ans, strings[i]); - ans += "\n"; + ans += Demangle(trace_symbol[i]) + "\n"; } if (size == KALDI_MAX_TRACE_SIZE) ans += ".\n.\n.\n"; // stack was too long, probably a bug. } - free(strings); // it's all in one big malloc()ed block. - -#ifdef HAVE_CXXABI_H // demangle the name, if possible. -#endif // HAVE_CXXABI_H + // cleanup, + free(trace_symbol); // it's okay, just the pointers, not the strings. +#endif // HAVE_EXECINFO_H return ans; } -#endif -void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str) { - std::ostringstream ss; - ss << "KALDI_ASSERT: at " << GetProgramName() << func << ':' - << GetShortFileName(file) - << ':' << line << ", failed: " << cond_str << '\n'; -#ifdef HAVE_EXECINFO_H - ss << "Stack trace is:\n" << KaldiGetStackTrace(); -#endif - std::cerr << ss.str(); - std::cerr.flush(); - // We used to call abort() here, but switch to throwing an exception - // (like KALDI_ERR) because it's easier to deal with in multi-threaded - // code. - throw std::runtime_error(ss.str()); -} +/***** KALDI LOGIGNG *****/ -KaldiWarnMessage::KaldiWarnMessage(const char *func, const char *file, - int32 line) { - this->stream() << "WARNING (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; +MessageLogger::MessageLogger(LogMessageEnvelope::Severity severity, + const char *func, const char *file, int32 line) { + // Obviously, we assume the strings survive the destruction of this object. + envelope_.severity = severity; + envelope_.func = func; + envelope_.file = GetShortFileName(file); // Pointer inside 'file'. + envelope_.line = line; } -KaldiLogMessage::KaldiLogMessage(const char *func, const char *file, - int32 line) { - this->stream() << "LOG (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; +MessageLogger::~MessageLogger() KALDI_NOEXCEPT(false) { + // remove trailing '\n', + std::string str = ss_.str(); + while (!str.empty() && str[str.length() - 1] == '\n') + str.resize(str.length() - 1); + + // print the mesage (or send to logging handler), + MessageLogger::HandleMessage(envelope_, str.c_str()); } -KaldiVlogMessage::KaldiVlogMessage(const char *func, const char *file, - int32 line, int32 verbose) { - this->stream() << "VLOG[" << verbose << "] (" << GetProgramName() << func - << "():" << GetShortFileName(file) << ':' << line << ") "; +void MessageLogger::HandleMessage(const LogMessageEnvelope &envelope, + const char *message) { + // Send to a logging handler if provided. + if (g_log_handler != NULL) { + g_log_handler(envelope, message); + } else { + // Otherwise, we use the default Kaldi logging. 
+ // Build the log-message 'header', + std::stringstream header; + if (envelope.severity > LogMessageEnvelope::kInfo) { + header << "VLOG[" << envelope.severity << "] ("; + } else { + switch (envelope.severity) { + case LogMessageEnvelope::kInfo : + header << "LOG ("; + break; + case LogMessageEnvelope::kWarning : + header << "WARNING ("; + break; + case LogMessageEnvelope::kError : + header << "ERROR ("; + break; + case LogMessageEnvelope::kAssertFailed : + header << "ASSERTION_FAILED ("; + break; + default: + abort(); // coding errror (unknown 'severity'), + } + } + // fill the other info from the envelope, + header << GetProgramName() << envelope.func << "():" + << envelope.file << ':' << envelope.line << ")"; + + // Printing the message, + if (envelope.severity >= LogMessageEnvelope::kWarning) { + // VLOG, LOG, WARNING: + fprintf(stderr, "%s %s\n", header.str().c_str(), message); + } else { + // ERROR, ASSERT_FAILED (print with stack-trace): + fprintf(stderr, "%s %s\n\n%s\n", header.str().c_str(), message, + KaldiGetStackTrace().c_str()); + } + } + + // Should we throw exception, or abort? + switch (envelope.severity) { + case LogMessageEnvelope::kAssertFailed: + abort(); // ASSERT_FAILED, + break; + case LogMessageEnvelope::kError: + if (!std::uncaught_exception()) { + // throw exception with empty message, + throw std::runtime_error(""); // KALDI_ERR, + } else { + // If we got here, this thread has already thrown exception, + // and this exception has not yet arrived to its 'catch' clause... + // Throwing a new exception would be unsafe! + // (can happen during 'stack unwinding', if we have 'KALDI_ERR << msg' + // in a destructor of some local object). + abort(); + } + break; + } } -KaldiErrorMessage::KaldiErrorMessage(const char *func, const char *file, - int32 line) { - this->stream() << "ERROR (" << GetProgramName() << func << "():" - << GetShortFileName(file) << ':' << line << ") "; + +/***** KALDI ASSERTS *****/ + +void KaldiAssertFailure_(const char *func, const char *file, + int32 line, const char *cond_str) { + MessageLogger ml(LogMessageEnvelope::kAssertFailed, func, file, line); + ml.stream() << ": '" << cond_str << "' "; } -KaldiErrorMessage::~KaldiErrorMessage() KALDI_NOEXCEPT(false) { - // (1) Print the message to stderr. - std::cerr << ss.str() << '\n'; - // (2) Throw an exception with the message, plus traceback info if available. - if (!std::uncaught_exception()) { -#ifdef HAVE_EXECINFO_H - throw std::runtime_error(ss.str() + "\n\n[stack trace: ]\n" + - KaldiGetStackTrace() + "\n"); -#else - throw std::runtime_error(ss.str()); -#endif - } else { - abort(); // This may be temporary... - } + +/***** THIRD-PARTY LOG-HANDLER *****/ + +LogHandler SetLogHandler(LogHandler new_handler) { + LogHandler old_handler = g_log_handler; + g_log_handler = new_handler; + return old_handler; } } // end namespace kaldi diff --git a/src/base/kaldi-error.h b/src/base/kaldi-error.h index 6de7eeea775..2911036d1b7 100644 --- a/src/base/kaldi-error.h +++ b/src/base/kaldi-error.h @@ -1,5 +1,6 @@ // base/kaldi-error.h +// Copyright 2016 Brno University of Technology (author: Karel Vesely) // Copyright 2009-2011 Microsoft Corporation; Ondrej Glembek; Lukas Burget; // Saarland University @@ -21,33 +22,40 @@ #ifndef KALDI_BASE_KALDI_ERROR_H_ #define KALDI_BASE_KALDI_ERROR_H_ 1 -#include -#include +#include #include #include -#include +#include +#include +#include "base/kaldi-types.h" +#include "base/kaldi-utils.h" +/* Important that this file does not depend on any other kaldi headers. 
*/ + +// By adding 'KALDI_NOEXCEPT(bool)' immediately after function declaration, +// we can tell the compiler that the function must-not produce +// exceptions (true), or may produce exceptions (false): #if _MSC_VER >= 1900 || (!defined(_MSC_VER) && __cplusplus >= 201103L) #define KALDI_NOEXCEPT(Predicate) noexcept((Predicate)) #elif defined(__GXX_EXPERIMENTAL_CXX0X__) && \ - (__GNUC__ >= 4 && __GNUC_MINOR__ >= 6) + (__GNUC__ >= 4 && __GNUC_MINOR__ >= 6) #define KALDI_NOEXCEPT(Predicate) noexcept((Predicate)) #else #define KALDI_NOEXCEPT(Predicate) #endif -#include "base/kaldi-types.h" -#include "base/kaldi-utils.h" - -/* Important that this file does not depend on any other kaldi headers. */ - +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif namespace kaldi { /// \addtogroup error_group /// @{ -/// This is set by util/parse-options.{h, cc} if you set --verbose = ? option +/***** VERBOSITY LEVEL *****/ + +/// This is set by util/parse-options.{h, cc} if you set --verbose=? option. extern int32 g_kaldi_verbose_level; /// This is set by util/parse-options.{h, cc} (from argv[0]) and used (if set) @@ -63,64 +71,82 @@ inline int32 GetVerboseLevel() { return g_kaldi_verbose_level; } /// automatically from ParseOptions. inline void SetVerboseLevel(int32 i) { g_kaldi_verbose_level = i; } -// Class KaldiLogMessage is invoked from the KALDI_WARN, KALDI_VLOG and -// KALDI_LOG macros. It prints the message to stderr. Note: we avoid -// using cerr, due to problems with thread safety. fprintf is guaranteed -// thread-safe. - -// class KaldiWarnMessage is invoked from the KALDI_WARN macro. -class KaldiWarnMessage { - public: - inline std::ostream &stream() { return ss; } - KaldiWarnMessage(const char *func, const char *file, int32 line); - ~KaldiWarnMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; -}; -// class KaldiLogMessage is invoked from the KALDI_LOG macro. -class KaldiLogMessage { - public: - inline std::ostream &stream() { return ss; } - KaldiLogMessage(const char *func, const char *file, int32 line); - ~KaldiLogMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; +/***** KALDI LOGGING *****/ + +/// Log message severity and source location info. +struct LogMessageEnvelope { + enum Severity { + kAssertFailed = -3, + kError = -2, + kWarning = -1, + kInfo = 0, + }; + // An 'enum Severity' value, or a positive number indicating verbosity level. + int severity; + const char *func; + const char *file; + int32 line; }; -// Class KaldiVlogMessage is invoked from the KALDI_VLOG macro. -class KaldiVlogMessage { - public: - KaldiVlogMessage(const char *func, const char *file, int32 line, - int32 verbose_level); - inline std::ostream &stream() { return ss; } - ~KaldiVlogMessage() { fprintf(stderr, "%s\n", ss.str().c_str()); } - private: - std::ostringstream ss; +// Class MessageLogger is invoked from the KALDI_ASSERT, KALDI_ERR, KALDI_WARN and +// KALDI_LOG macros. It formats the message, then either prints it to stderr or +// passes to the log custom handler if provided, then, in case of the error, +// throws an std::runtime_exception, in case of failed KALDI_ASSERT calls abort(). +// +// Note: we avoid using std::cerr for thread safety issues. +// fprintf(stderr,...) is guaranteed thread-safe, and outputs +// its formatted string atomically. 
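
The LogMessageEnvelope above is what a third-party log handler receives: the severity (or positive verbosity level) plus the function, file and line the message came from. Together with SetLogHandler() from the kaldi-error.cc hunk, an application can redirect all Kaldi messages. A minimal usage sketch against the patched header (the handler body and the prefix it prints are only illustrative):

    #include <cstdio>
    #include "base/kaldi-error.h"

    // Send every Kaldi message to an application-specific sink; here just
    // stderr with a custom prefix.
    static void MyLogHandler(const kaldi::LogMessageEnvelope &envelope,
                             const char *message) {
      const char *tag =
          envelope.severity >= kaldi::LogMessageEnvelope::kInfo ? "info" :
          envelope.severity == kaldi::LogMessageEnvelope::kWarning ? "warn" :
          "error";
      std::fprintf(stderr, "[myapp:%s] %s:%d %s\n", tag, envelope.file,
                   static_cast<int>(envelope.line), message);
    }

    int main() {
      kaldi::LogHandler old_handler = kaldi::SetLogHandler(MyLogHandler);
      KALDI_LOG << "this goes through MyLogHandler";
      KALDI_WARN << "so does this";
      kaldi::SetLogHandler(old_handler);  // NULL restores default stderr logging.
      return 0;
    }
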
+class MessageLogger { +public: + /// Constructor stores the info, + MessageLogger(LogMessageEnvelope::Severity severity, + const char *func, + const char *file, + int32 line); + + /// Destructor, calls 'HandleMessage' which prints the message, + /// (since C++11 a 'throwing' destructor must be declared 'noexcept(false)') + ~MessageLogger() KALDI_NOEXCEPT(false); + + /// The hook for the 'insertion operator', e.g. + /// 'KALDI_LOG << "Message,"', + inline std::ostream &stream() { return ss_; } + +private: + /// The logging function, + static void HandleMessage(const LogMessageEnvelope &env, const char *msg); + +private: + LogMessageEnvelope envelope_; + std::ostringstream ss_; }; - -// class KaldiErrorMessage is invoked from the KALDI_ERROR macro. -// The destructor throws an exception. -class KaldiErrorMessage { - public: - KaldiErrorMessage(const char *func, const char *file, int32 line); - inline std::ostream &stream() { return ss; } - ~KaldiErrorMessage() KALDI_NOEXCEPT(false); // defined in kaldi-error.cc - private: - std::ostringstream ss; -}; +// The definition of the logging macros, +#define KALDI_ERR \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kError, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_WARN \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kWarning, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_LOG \ + ::kaldi::MessageLogger(::kaldi::LogMessageEnvelope::kInfo, \ + __func__, __FILE__, __LINE__).stream() +#define KALDI_VLOG(v) if ((v) <= ::kaldi::g_kaldi_verbose_level) \ + ::kaldi::MessageLogger((::kaldi::LogMessageEnvelope::Severity)(v), \ + __func__, __FILE__, __LINE__).stream() +/***** KALDI ASSERTS *****/ -#ifdef _MSC_VER -#define __func__ __FUNCTION__ -#endif +void KaldiAssertFailure_(const char *func, const char *file, + int32 line, const char *cond_str); // Note on KALDI_ASSERT and KALDI_PARANOID_ASSERT // The original (simple) version of the code was this // -// #define KALDI_ASSERT(cond) if (!(cond)) kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); +// #define KALDI_ASSERT(cond) if (!(cond)) +// kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); // // That worked well, but we were concerned that it // could potentially cause a performance issue due to failed branch @@ -139,35 +165,34 @@ class KaldiErrorMessage { // and compilers will be able to optimize the loop away (as the condition // is always false). #ifndef NDEBUG -#define KALDI_ASSERT(cond) \ - do { if ((cond)) ; else kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond);} while(0) +#define KALDI_ASSERT(cond) do { if (cond) (void)0; else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) #else -#define KALDI_ASSERT(cond) +#define KALDI_ASSERT(cond) (void)0 #endif -// also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, +// Also see KALDI_COMPILE_TIME_ASSERT, defined in base/kaldi-utils.h, // and KALDI_ASSERT_IS_INTEGER_TYPE and KALDI_ASSERT_IS_FLOATING_TYPE, // also defined there. 
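
With the macros above, KALDI_LOG and KALDI_WARN print immediately, KALDI_VLOG(v) only fires when v <= the global verbosity level, and KALDI_ERR prints the message plus a stack trace and then throws std::runtime_error (now with an empty what(), since the text has already gone to stderr). A short usage sketch:

    #include <stdexcept>
    #include "base/kaldi-error.h"

    int main() {
      kaldi::SetVerboseLevel(2);
      KALDI_LOG << "starting up";
      KALDI_VLOG(2) << "printed, because 2 <= verbose level";
      KALDI_VLOG(3) << "suppressed, because 3 > verbose level";
      KALDI_WARN << "something looks off";
      try {
        KALDI_ERR << "fatal condition";   // prints, then throws.
      } catch (const std::runtime_error &e) {
        // Recoverable here; the message itself already went to stderr.
      }
      return 0;
    }
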
-#ifdef KALDI_PARANOID // some more expensive asserts only checked if this defined -#define KALDI_PARANOID_ASSERT(cond) \ - do { if ((cond)) ; else kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond);} while(0) +// some more expensive asserts only checked if this defined +#ifdef KALDI_PARANOID +#define KALDI_PARANOID_ASSERT(cond) do { if (cond) (void)0; else \ + ::kaldi::KaldiAssertFailure_(__func__, __FILE__, __LINE__, #cond); } while(0) #else -#define KALDI_PARANOID_ASSERT(cond) +#define KALDI_PARANOID_ASSERT(cond) (void)0 #endif -#define KALDI_ERR kaldi::KaldiErrorMessage(__func__, __FILE__, __LINE__).stream() -#define KALDI_WARN kaldi::KaldiWarnMessage(__func__, __FILE__, __LINE__).stream() -#define KALDI_LOG kaldi::KaldiLogMessage(__func__, __FILE__, __LINE__).stream() +/***** THIRD-PARTY LOG-HANDLER *****/ -#define KALDI_VLOG(v) if (v <= kaldi::g_kaldi_verbose_level) \ - kaldi::KaldiVlogMessage(__func__, __FILE__, __LINE__, v).stream() +/// Type of third-party logging function, +typedef void (*LogHandler)(const LogMessageEnvelope &envelope, + const char *message); -inline bool IsKaldiError(const std::string &str) { - return(!strncmp(str.c_str(), "ERROR ", 6)); -} - -void KaldiAssertFailure_(const char *func, const char *file, - int32 line, const char *cond_str); +/// Set logging handler. If called with a non-NULL function pointer, the +/// function pointed by it is called to send messages to a caller-provided +/// log. If called with NULL pointer, restores default Kaldi error logging to +/// stderr. SetLogHandler is obviously not thread safe. +LogHandler SetLogHandler(LogHandler); /// @} end "addtogroup error_group" diff --git a/src/base/kaldi-math-test.cc b/src/base/kaldi-math-test.cc index 3026f05502f..52719cc4669 100644 --- a/src/base/kaldi-math-test.cc +++ b/src/base/kaldi-math-test.cc @@ -1,5 +1,5 @@ // base/kaldi-math-test.cc -// +// // Copyright 2009-2011 Microsoft Corporation; Yanmin Qian; Jan Silovsky // See ../../COPYING for clarification regarding multiple authors @@ -17,8 +17,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-math.h" -#include "base/timer.h" #include +#include "base/timer.h" namespace kaldi { @@ -37,7 +37,7 @@ template void UnitTestGcdLcmTpl() { KALDI_ASSERT((c*a) % g == 0); // test least common multiple - if (b <= 0 || c <= 0) continue; // lcm not defined unless both positive. + if (b <= 0 || c <= 0) continue; // lcm not defined unless both positive. I h = Lcm(b*a, c*a); KALDI_ASSERT(h != 0 && (h % (b*a)) == 0 && (h % (c*a)) == 0); @@ -54,18 +54,17 @@ void UnitTestRoundUpToNearestPowerOfTwo() { KALDI_ASSERT(RoundUpToNearestPowerOfTwo(255) == 256); KALDI_ASSERT(RoundUpToNearestPowerOfTwo(256) == 256); KALDI_ASSERT(RoundUpToNearestPowerOfTwo(257) == 512); - KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824 ); + KALDI_ASSERT(RoundUpToNearestPowerOfTwo(1073700000) == 1073741824); } void UnitTestGcdLcm() { UnitTestGcdLcmTpl(); UnitTestGcdLcmTpl(); - UnitTestGcdLcmTpl(); + UnitTestGcdLcmTpl(); } void UnitTestRand() { // Testing random-number generation. - using namespace kaldi; std::cout << "Testing random-number generation. " << "If there is an error this may not terminate.\n"; std::cout << "If this does not terminate, look more closely. 
" @@ -77,14 +76,14 @@ void UnitTestRand() { float sum = RandUniform()-0.5; for (int j = 0; ; j++) { sum += RandUniform()-0.5; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test RandGauss. float sum = RandGauss(); for (int j = 0; ; j++) { sum += RandGauss(); - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test RandGauss. @@ -93,8 +92,9 @@ void UnitTestRand() { float a, b; RandGauss2(&a, &b); if (i % 2 == 0) sum += a; - else sum += b; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + else + sum += b; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } { // test poisson_Rand(). @@ -105,7 +105,7 @@ void UnitTestRand() { double sum = RandPoisson(lambda) - lambda; // expected value is zero. for (int j = 0; ; j++) { sum += RandPoisson(lambda) - lambda; - if (std::abs(sum) < 0.5*sqrt((double)j)) break; + if (std::abs(sum) < 0.5*sqrt(static_cast(j))) break; } } @@ -138,7 +138,8 @@ void UnitTestRand() { float sum = RandInt(minint, maxint) + 0.5*(minint+maxint); for (int j = 0; ; j++) { sum += RandInt(minint, maxint) - 0.5*(minint+maxint); - if (std::abs((float)sum) < 0.5*sqrt((double)j)*(maxint-minint)) break; + if (std::abs(static_cast(sum)) < + 0.5*sqrt(static_cast(j))*(maxint-minint)) break; } } { // test RandPrune in basic way. @@ -157,7 +158,6 @@ void UnitTestRand() { } void UnitTestLogAddSub() { - using namespace kaldi; for (int i = 0; i < 100; i++) { double f1 = Rand() % 10000, f2 = Rand() % 20; double add1 = Exp(LogAdd(Log(f1), Log(f2))); @@ -167,7 +167,8 @@ void UnitTestLogAddSub() { try { - double f2_check = Exp(LogSub(Log(add), Log(f1))), thresh = (f2*0.01)+0.001; + double f2_check = Exp(LogSub(Log(add), Log(f1))), + thresh = (f2*0.01)+0.001; KALDI_ASSERT(std::abs(f2_check-f2) < thresh); } catch(...) { KALDI_ASSERT(f2 == 0); // It will probably crash for f2=0. @@ -192,17 +193,20 @@ void UnitTestDefines() { // Yes, we even unit-test the preprocessor statements. 
std::cout << 1.0+DBL_EPSILON; std::cout << 1.0 + 0.5*DBL_EPSILON; KALDI_ASSERT(1.0 + DBL_EPSILON != 1.0 && 1.0 + (0.5*DBL_EPSILON) == 1.0 - && "If this test fails, you can probably just comment it out-- may mean your CPU exceeds expected floating point precision"); + && "If this test fails, you can probably just comment it out-- " + "may mean your CPU exceeds expected floating point precision"); KALDI_ASSERT(1.0f + FLT_EPSILON != 1.0f && 1.0f + (0.5f*FLT_EPSILON) == 1.0f - && "If this test fails, you can probably just comment it out-- may mean your CPU exceeds expected floating point precision"); - KALDI_ASSERT(std::abs(sin(M_PI)) < 1.0e-05 && std::abs(cos(M_PI)+1.0) < 1.0e-05); - KALDI_ASSERT(std::abs(sin(M_2PI)) < 1.0e-05 && std::abs(cos(M_2PI)-1.0) < 1.0e-05); + && "If this test fails, you can probably just comment it out-- " + "may mean your CPU exceeds expected floating point precision"); + KALDI_ASSERT(std::abs(sin(M_PI)) < 1.0e-05 + && std::abs(cos(M_PI)+1.0) < 1.0e-05); + KALDI_ASSERT(std::abs(sin(M_2PI)) < 1.0e-05 + && std::abs(cos(M_2PI)-1.0) < 1.0e-05); KALDI_ASSERT(std::abs(sin(Exp(M_LOG_2PI))) < 1.0e-05); KALDI_ASSERT(std::abs(cos(Exp(M_LOG_2PI)) - 1.0) < 1.0e-05); } void UnitTestAssertFunc() { // Testing Assert** *functions - using namespace kaldi; for (int i = 1; i < 100; i++) { float f1 = Rand() % 10000 + 1, f2 = Rand() % 20 + 1; float tmp1 = f1 * f2; @@ -234,7 +238,7 @@ template void UnitTestFactorizeTpl() { void UnitTestFactorize() { UnitTestFactorizeTpl(); UnitTestFactorizeTpl(); - UnitTestFactorizeTpl(); + UnitTestFactorizeTpl(); } void UnitTestApproxEqual() { @@ -254,7 +258,7 @@ void UnitTestApproxEqual() { KALDI_ASSERT(!ApproxEqual(-std::numeric_limits::infinity(), 0)); KALDI_ASSERT(!ApproxEqual(-std::numeric_limits::infinity(), - 1)); + 1)); } template @@ -273,8 +277,8 @@ void UnitTestExpSpeed() { KALDI_ASSERT(sum > 0.0); // make it harder for the compiler to optimize Exp // away, as we have a conditional. Real flops = 1.0e-06 * num_ops / tim.Elapsed(); - KALDI_LOG << "Megaflops doing Exp(" << (sizeof(Real) == 4 ? "float" : "double") - << ") is " << flops; + KALDI_LOG << "Megaflops doing Exp(" + << (sizeof(Real) == 4 ? "float" : "double") << ") is " << flops; } @@ -287,15 +291,15 @@ void UnitTestLogSpeed() { Timer tim; while (tim.Elapsed() < time) { for (int i = 0; i < block_size; i++) { - sum += Log((float)(i + 1)); + sum += Log(static_cast(i + 1)); } num_ops += block_size; } KALDI_ASSERT(sum > 0.0); // make it harder for the compiler to optimize Log // away, as we have a conditional. Real flops = 1.0e-06 * num_ops / tim.Elapsed(); - KALDI_LOG << "Megaflops doing Log(" << (sizeof(Real) == 4 ? "float" : "double") - << ") is " << flops; + KALDI_LOG << "Megaflops doing Log(" + << (sizeof(Real) == 4 ? "float" : "double") << ") is " << flops; } } // end namespace kaldi. diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index 3496794e78a..dd269fd0cbc 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -18,11 +18,11 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#include #include "base/kaldi-math.h" #ifndef _MSC_VER #include #endif +#include namespace kaldi { // These routines are tested in matrix/matrix-test.cc @@ -42,16 +42,14 @@ int32 RoundUpToNearestPowerOfTwo(int32 n) { static pthread_mutex_t _RandMutex = PTHREAD_MUTEX_INITIALIZER; #endif -int Rand(struct RandomState* state) -{ +int Rand(struct RandomState* state) { #ifdef _MSC_VER // On Windows, just call Rand() return rand(); #else if (state) { return rand_r(&(state->seed)); - } - else { + } else { int rs = pthread_mutex_lock(&_RandMutex); KALDI_ASSERT(rs == 0); int val = rand(); @@ -86,7 +84,7 @@ bool WithProb(BaseFloat prob, struct RandomState* state) { // prob is very small but nonzero, and the "main algorithm" // wouldn't work that well. So: with probability 1/128, we // return WithProb (prob * 128), else return false. - if (Rand(state) < RAND_MAX / 128) { // with probability 128... + if (Rand(state) < RAND_MAX / 128) { // with probability 128... // Note: we know that prob * 128.0 < 1.0, because // we asserted RAND_MAX > 128 * 128. return WithProb(prob * 128.0); @@ -98,7 +96,8 @@ bool WithProb(BaseFloat prob, struct RandomState* state) { } } -int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // This is not exact. +int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { + // This is not exact. KALDI_ASSERT(max_val >= min_val); if (max_val == min_val) return min_val; @@ -106,9 +105,11 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // Thi // RAND_MAX is quite small on Windows -> may need to handle larger numbers. if (RAND_MAX > (max_val-min_val)*8) { // *8 to avoid large inaccuracies in probability, from the modulus... - return min_val + ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); + return min_val + + ((unsigned int)Rand(state) % (unsigned int)(max_val+1-min_val)); } else { - if ((unsigned int)(RAND_MAX*RAND_MAX) > (unsigned int)((max_val+1-min_val)*8)) { + if ((unsigned int)(RAND_MAX*RAND_MAX) > + (unsigned int)((max_val+1-min_val)*8)) { // *8 to avoid inaccuracies in probability, from the modulus... return min_val + ( (unsigned int)( (Rand(state)+RAND_MAX*Rand(state))) % (unsigned int)(max_val+1-min_val)); @@ -121,7 +122,7 @@ int32 RandInt(int32 min_val, int32 max_val, struct RandomState* state) { // Thi } #else return min_val + - (static_cast(Rand(state)) % (int32)(max_val+1-min_val)); + (static_cast(Rand(state)) % static_cast(max_val+1-min_val)); #endif } @@ -141,8 +142,7 @@ int32 RandPoisson(float lambda, struct RandomState* state) { return k-1; } -void RandGauss2(float *a, float *b, RandomState *state) -{ +void RandGauss2(float *a, float *b, RandomState *state) { KALDI_ASSERT(a); KALDI_ASSERT(b); float u1 = RandUniform(state); @@ -153,15 +153,15 @@ void RandGauss2(float *a, float *b, RandomState *state) *b = u1 * sinf(u2); } -void RandGauss2(double *a, double *b, RandomState *state) -{ +void RandGauss2(double *a, double *b, RandomState *state) { KALDI_ASSERT(a); KALDI_ASSERT(b); float a_float, b_float; // Just because we're using doubles doesn't mean we need super-high-quality // random numbers, so we just use the floating-point version internally. 
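
WithProb() handles very small probabilities with a rescaling trick: with probability 1/128 it recurses on prob * 128, otherwise it returns false, which keeps the expected acceptance rate at (1/128) * (128 * prob) = prob while the direct Rand() comparison only ever deals with probabilities that are not tiny relative to RAND_MAX. A self-contained sketch of the same idea using std::rand (Kaldi's version additionally goes through its thread-safe Rand() and asserts on the argument range):

    #include <cassert>
    #include <cstdlib>
    #include <iostream>

    // Return true with probability 'prob', 0 <= prob <= 1, handling tiny
    // 'prob' by rescaling instead of comparing against a value near zero.
    bool WithProbSketch(double prob) {
      assert(prob >= 0.0 && prob <= 1.0);
      if (prob == 0.0) return false;
      if (prob == 1.0) return true;
      if (prob * 128.0 < 1.0) {
        // With probability 1/128, recurse on a 128x larger probability.
        if (std::rand() < RAND_MAX / 128) return WithProbSketch(prob * 128.0);
        return false;
      }
      return std::rand() < RAND_MAX * prob;
    }

    int main() {
      int hits = 0, trials = 1000000;
      for (int i = 0; i < trials; i++)
        if (WithProbSketch(0.001)) ++hits;
      std::cout << "empirical probability: "
                << hits / static_cast<double>(trials)
                << " (expected about 0.001)\n";
      return 0;
    }
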
RandGauss2(&a_float, &b_float, state); - *a = a_float; *b = b_float; + *a = a_float; + *b = b_float; } diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index edbc8010195..ac590a06a25 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -41,20 +41,19 @@ #endif #ifndef M_PI -# define M_PI 3.1415926535897932384626433832795 +#define M_PI 3.1415926535897932384626433832795 #endif #ifndef M_SQRT2 -# define M_SQRT2 1.4142135623730950488016887 +#define M_SQRT2 1.4142135623730950488016887 #endif - #ifndef M_2PI -# define M_2PI 6.283185307179586476925286766559005 +#define M_2PI 6.283185307179586476925286766559005 #endif #ifndef M_SQRT1_2 -# define M_SQRT1_2 0.7071067811865475244008443621048490 +#define M_SQRT1_2 0.7071067811865475244008443621048490 #endif #ifndef M_LOG_2PI @@ -65,6 +64,11 @@ #define M_LN2 0.693147180559945309417232121458 #endif +#ifndef M_LN10 +#define M_LN10 2.302585092994045684017991454684 +#endif + + #define KALDI_ISNAN std::isnan #define KALDI_ISINF std::isinf #define KALDI_ISFINITE(x) std::isfinite(x) @@ -81,7 +85,7 @@ inline double Exp(double x) { return exp(x); } inline float Exp(float x) { return expf(x); } #else inline float Exp(float x) { return exp(static_cast(x)); } -#endif // KALDI_NO_EXPF +#endif // KALDI_NO_EXPF #else inline double Exp(double x) { return exp(x); } #if !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) @@ -90,8 +94,8 @@ inline double Exp(double x) { return exp(x); } inline float Exp(float x) { return exp(static_cast(x)); } #else inline float Exp(float x) { return expf(x); } -#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) -#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) +#endif // !defined(__INTEL_COMPILER) && _MSC_VER == 1800 && defined(_M_X64) +#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900) inline double Log(double x) { return log(x); } inline float Log(float x) { return logf(x); } @@ -126,7 +130,7 @@ const double kLogZeroDouble = -std::numeric_limits::infinity(); const BaseFloat kLogZeroBaseFloat = -std::numeric_limits::infinity(); // Returns a random integer between 0 and RAND_MAX, inclusive -int Rand(struct RandomState* state=NULL); +int Rand(struct RandomState* state = NULL); // State for thread-safe random number generator struct RandomState { @@ -135,9 +139,10 @@ struct RandomState { }; // Returns a random integer between min and max inclusive. -int32 RandInt(int32 min, int32 max, struct RandomState* state=NULL); +int32 RandInt(int32 min, int32 max, struct RandomState* state = NULL); -bool WithProb(BaseFloat prob, struct RandomState* state=NULL); // Returns true with probability "prob", +// Returns true with probability "prob", +bool WithProb(BaseFloat prob, struct RandomState* state = NULL); // with 0 <= prob <= 1 [we check this]. // Internally calls Rand(). This function is carefully implemented so // that it should work even if prob is very small. @@ -155,7 +160,7 @@ inline float RandGauss(struct RandomState* state = NULL) { // Returns poisson-distributed random number. Uses Knuth's algorithm. // Take care: this takes time proportinal // to lambda. Faster algorithms exist but are more complex. -int32 RandPoisson(float lambda, struct RandomState* state=NULL); +int32 RandPoisson(float lambda, struct RandomState* state = NULL); // Returns a pair of gaussian random numbers. 
Uses Box-Muller transform void RandGauss2(float *a, float *b, RandomState *state = NULL); @@ -166,7 +171,8 @@ void RandGauss2(double *a, double *b, RandomState *state = NULL); // This is a randomized pruning mechanism that preserves expectations, // that we typically use to prune posteriors. template -inline Float RandPrune(Float post, BaseFloat prune_thresh, struct RandomState* state=NULL) { +inline Float RandPrune(Float post, BaseFloat prune_thresh, + struct RandomState* state = NULL) { KALDI_ASSERT(prune_thresh >= 0.0); if (post == 0.0 || std::abs(post) >= prune_thresh) return post; @@ -256,11 +262,11 @@ inline float LogSub(float x, float y) { static inline bool ApproxEqual(float a, float b, float relative_tolerance = 0.001) { // a==b handles infinities. - if (a==b) return true; + if (a == b) return true; float diff = std::abs(a-b); if (diff == std::numeric_limits::infinity() - || diff != diff) return false; // diff is +inf or nan. - return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); + || diff != diff) return false; // diff is +inf or nan. + return (diff <= relative_tolerance*(std::abs(a)+std::abs(b))); } /// assert abs(a - b) <= relative_tolerance * (abs(a)+abs(b)) diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h index c67529eb917..7ebf4f85386 100644 --- a/src/base/kaldi-types.h +++ b/src/base/kaldi-types.h @@ -39,15 +39,37 @@ typedef float BaseFloat; // we find in the future lacks stdint.h #include +// for discussion on what to do if you need compile kaldi +// without OpenFST, see the bottom of this this file +#include + namespace kaldi { -typedef uint16_t uint16; -typedef uint32_t uint32; -typedef uint64_t uint64; -typedef int16_t int16; -typedef int32_t int32; -typedef int64_t int64; -typedef float float32; -typedef double double64; + using ::int16; + using ::int32; + using ::int64; + using ::uint16; + using ::uint32; + using ::uint64; + typedef float float32; + typedef double double64; +} // end namespace kaldi + +// In a theoretical case you decide compile Kaldi without the OpenFST +// comment the previous namespace statement and uncomment the following +/* +namespace kaldi { + typedef int8_t int8; + typedef int16_t int16; + typedef int32_t int32; + typedef int64_t int64; + + typedef uint8_t uint8; + typedef uint16_t uint16; + typedef uint32_t uint32; + typedef uint64_t uint64; + typedef float float32; + typedef double double64; } // end namespace kaldi +*/ #endif // KALDI_BASE_KALDI_TYPES_H_ diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index c7d82a7c4c5..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -16,26 +16,29 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
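
RandGauss2() produces a pair of independent Gaussian samples from two uniform samples via the Box-Muller transform, and the double overload simply reuses the float version since high-quality randomness is not needed there. The transform itself as a standalone sketch, with a quick empirical check of the first two moments:

    #include <cmath>
    #include <cstdlib>
    #include <iostream>

    // Box-Muller: two independent uniform (0,1] samples -> two independent
    // N(0,1) samples.
    void BoxMullerPair(float *a, float *b) {
      const float kTwoPi = 6.2831853071795864769f;
      float u1 = (std::rand() + 1.0f) / (RAND_MAX + 1.0f);  // in (0,1], avoids log(0).
      float u2 = (std::rand() + 1.0f) / (RAND_MAX + 1.0f);
      float r = std::sqrt(-2.0f * std::log(u1));
      *a = r * std::cos(kTwoPi * u2);
      *b = r * std::sin(kTwoPi * u2);
    }

    int main() {
      double sum = 0.0, sumsq = 0.0;
      const int n = 100000;  // must be even.
      for (int i = 0; i < n; i += 2) {
        float a, b;
        BoxMullerPair(&a, &b);
        sum += a + b;
        sumsq += a * a + b * b;
      }
      std::cout << "sample mean " << sum / n << " (expect ~0), "
                << "sample second moment " << sumsq / n << " (expect ~1)\n";
      return 0;
    }
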
-#include -#include "base/kaldi-common.h" - - #ifdef _WIN32_WINNT_WIN8 #include -#elif defined (_WIN32) || defined(_MSC_VER) || defined(MINGW) +#elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 +#define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif +#include +#include "base/kaldi-common.h" + + namespace kaldi { std::string CharToString(const char &c) { char buf[20]; if (std::isprint(c)) - sprintf(buf, "\'%c\'", c); + snprintf(buf, sizeof(buf), "\'%c\'", c); else - sprintf(buf, "[character %d]", (int) c); + snprintf(buf, sizeof(buf), "[character %d]", static_cast(c)); return (std::string) buf; } diff --git a/src/base/kaldi-utils.h b/src/base/kaldi-utils.h index deac0f6b634..47c60b4b01d 100644 --- a/src/base/kaldi-utils.h +++ b/src/base/kaldi-utils.h @@ -21,15 +21,22 @@ #ifndef KALDI_BASE_KALDI_UTILS_H_ #define KALDI_BASE_KALDI_UTILS_H_ 1 -#include -#include - #if defined(_MSC_VER) # define WIN32_LEAN_AND_MEAN # define NOMINMAX # include #endif +#ifdef _MSC_VER +#include +#define unlink _unlink +#else +#include +#endif + +#include +#include + #if defined(_MSC_VER) #pragma warning(disable: 4244 4056 4305 4800 4267 4996 4756 4661) #if _MSC_VER < 1400 @@ -39,22 +46,14 @@ #endif #endif -#ifdef HAVE_POSIX_MEMALIGN -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) -# define KALDI_MEMALIGN_FREE(x) free(x) -#elif defined(HAVE_MEMALIGN) - /* Some systems have memalign() but no declaration for it */ - void * memalign(size_t align, size_t size); -# define KALDI_MEMALIGN(align, size, pp_orig) \ - (*(pp_orig) = memalign(align, size)) -# define KALDI_MEMALIGN_FREE(x) free(x) -#elif defined(_MSC_VER) +#ifdef _MSC_VER # define KALDI_MEMALIGN(align, size, pp_orig) \ (*(pp_orig) = _aligned_malloc(size, align)) # define KALDI_MEMALIGN_FREE(x) _aligned_free(x) #else -#error Manual memory alignment is no longer supported +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) +# define KALDI_MEMALIGN_FREE(x) free(x) #endif #ifdef __ICC @@ -82,22 +81,36 @@ inline int MachineIsLittleEndian() { return (*reinterpret_cast(&check) != 0); } -// This function kaldi::Sleep() provides a portable way to sleep for a possibly fractional +// This function kaldi::Sleep() provides a portable way +// to sleep for a possibly fractional // number of seconds. On Windows it's only accurate to microseconds. 
void Sleep(float seconds); - } #define KALDI_SWAP8(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[7]; ((char*)&a)[7]=t;\ - t = ((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[6]; ((char*)&a)[6]=t;\ - t = ((char*)&a)[2]; ((char*)&a)[2]=((char*)&a)[5]; ((char*)&a)[5]=t;\ - t = ((char*)&a)[3]; ((char*)&a)[3]=((char*)&a)[4]; ((char*)&a)[4]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[7];\ + (reinterpret_cast(&a))[7]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[6];\ + (reinterpret_cast(&a))[6]=t;\ + t = (reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=(reinterpret_cast(&a))[5];\ + (reinterpret_cast(&a))[5]=t;\ + t = (reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=(reinterpret_cast(&a))[4];\ + (reinterpret_cast(&a))[4]=t;} #define KALDI_SWAP4(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[3]; ((char*)&a)[3]=t;\ - t = ((char*)&a)[1]; ((char*)&a)[1]=((char*)&a)[2]; ((char*)&a)[2]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[3];\ + (reinterpret_cast(&a))[3]=t;\ + t = (reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=(reinterpret_cast(&a))[2];\ + (reinterpret_cast(&a))[2]=t;} #define KALDI_SWAP2(a) { \ - int t = ((char*)&a)[0]; ((char*)&a)[0]=((char*)&a)[1]; ((char*)&a)[1]=t;} + int t = (reinterpret_cast(&a))[0];\ + (reinterpret_cast(&a))[0]=(reinterpret_cast(&a))[1];\ + (reinterpret_cast(&a))[1]=t;} // Makes copy constructor and operator= private. Same as in compat.h of OpenFst @@ -109,7 +122,7 @@ void Sleep(float seconds); template class KaldiCompileTimeAssert { }; template<> class KaldiCompileTimeAssert { public: - static inline void Check() { } + static inline void Check() { } }; #define KALDI_COMPILE_TIME_ASSERT(b) KaldiCompileTimeAssert<(b)>::Check() @@ -122,14 +135,6 @@ template<> class KaldiCompileTimeAssert { KaldiCompileTimeAssert::is_specialized \ && !std::numeric_limits::is_integer>::Check() -#ifdef _MSC_VER -#include -#define unlink _unlink -#else -#include -#endif - - #ifdef _MSC_VER #define KALDI_STRCASECMP _stricmp #else diff --git a/src/base/timer-test.cc b/src/base/timer-test.cc index 32ceebd9f6e..86a20b486ee 100644 --- a/src/base/timer-test.cc +++ b/src/base/timer-test.cc @@ -36,7 +36,6 @@ void TimerTest() { KALDI_ERR << "Timer fail: waited " << f << " seconds instead of " << time_secs << " secs."; } - } diff --git a/src/base/timer.h b/src/base/timer.h index d93a46143c2..eff7da31529 100644 --- a/src/base/timer.h +++ b/src/base/timer.h @@ -25,9 +25,7 @@ #if defined(_MSC_VER) || defined(MINGW) -namespace kaldi -{ - +namespace kaldi { class Timer { public: Timer() { Reset(); } @@ -38,9 +36,14 @@ class Timer { LARGE_INTEGER time_end; LARGE_INTEGER freq; QueryPerformanceCounter(&time_end); - if (QueryPerformanceFrequency(&freq) == 0) return 0.0; // Hardware does not support this. - return ((double)time_end.QuadPart - (double)time_start_.QuadPart) / - ((double)freq.QuadPart); + + if (QueryPerformanceFrequency(&freq) == 0) { + // Hardware does not support this. 
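
The KALDI_SWAP{8,4,2} macros reverse byte order in place through reinterpret_cast<char*>, and MachineIsLittleEndian() tests whether the low-order byte of an int set to 1 comes first in memory. The same two ideas as plain functions (a sketch, not the macros themselves):

    #include <cstdio>

    inline int MachineIsLittleEndianSketch() {
      int check = 1;
      return *reinterpret_cast<char*>(&check) != 0;  // low byte first => little-endian.
    }

    // In-place byte reversal of a 4-byte object, equivalent in effect to KALDI_SWAP4.
    inline void Swap4(void *p) {
      char *c = reinterpret_cast<char*>(p);
      char t = c[0]; c[0] = c[3]; c[3] = t;
      t = c[1]; c[1] = c[2]; c[2] = t;
    }

    int main() {
      unsigned int x = 0x11223344u;
      Swap4(&x);
      std::printf("little-endian: %d, swapped: 0x%08x\n",
                  MachineIsLittleEndianSketch(), x);  // expect 0x44332211.
      return 0;
    }
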
+ return 0.0; + } + return (static_cast(time_end.QuadPart) - + static_cast(time_start_.QuadPart)) / + (static_cast(freq.QuadPart)); } private: LARGE_INTEGER time_start_; @@ -48,13 +51,11 @@ class Timer { } #else +#include +#include -# include -# include -namespace kaldi -{ -class Timer -{ +namespace kaldi { +class Timer { public: Timer() { Reset(); } @@ -65,9 +66,10 @@ class Timer struct timeval time_end; gettimeofday(&time_end, &time_zone_); double t1, t2; - t1 = (double)time_start_.tv_sec + - (double)time_start_.tv_usec/(1000*1000); - t2 = (double)time_end.tv_sec + (double)time_end.tv_usec/(1000*1000); + t1 = static_cast(time_start_.tv_sec) + + static_cast(time_start_.tv_usec)/(1000*1000); + t2 = static_cast(time_end.tv_sec) + + static_cast(time_end.tv_usec)/(1000*1000); return t2-t1; } @@ -80,4 +82,4 @@ class Timer #endif -#endif +#endif // KALDI_BASE_TIMER_H_ diff --git a/src/bin/Makefile b/src/bin/Makefile index ac175e42e0e..a1df9b5d48a 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -1,12 +1,14 @@ all: + -rm -f arpa2fst EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk BINFILES = align-equal align-equal-compiled acc-tree-stats \ show-alignments compile-questions cluster-phones \ - compute-wer make-h-transducer add-self-loops convert-ali \ - compile-train-graphs compile-train-graphs-fsts arpa2fst \ + compute-wer compute-wer-bootci make-h-transducer \ + add-self-loops convert-ali \ + compile-train-graphs compile-train-graphs-fsts \ make-pdf-to-tid-transducer make-ilabel-transducer show-transitions \ ali-to-phones ali-to-post weight-silence-post acc-lda est-lda \ ali-to-pdf est-mllt build-tree build-tree-two-level decode-faster \ @@ -36,4 +38,3 @@ ADDLIBS = ../lm/kaldi-lm.a ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ TESTFILES = include ../makefiles/default_rules.mk - diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc index 8169ae79bde..92cd192b9a6 100644 --- a/src/bin/acc-lda.cc +++ b/src/bin/acc-lda.cc @@ -86,7 +86,7 @@ int main(int argc, char *argv[]) { if (feats.NumRows() != static_cast(post.size())) { KALDI_WARN << "Posterior vs. feats size mismatch " - << feats.NumRows() << " vs. " < \n" "e.g.: \n" " acc-tree-stats 1.mdl scp:train.scp ark:1.ali 1.tacc\n"; - ParseOptions po(usage); + bool binary = true; - float var_floor = 0.01; - string ci_phones_str; - std::string phone_map_rxfilename; - int N = 3; - int P = 1; + AccumulateTreeStatsOptions opts; + ParseOptions po(usage); po.Register("binary", &binary, "Write output in binary mode"); - po.Register("var-floor", &var_floor, "Variance floor for tree clustering."); - po.Register("ci-phones", &ci_phones_str, "Colon-separated list of integer " - "indices of context-independent phones (after mapping, if " - "--phone-map option is used)."); - po.Register("context-width", &N, "Context window size."); - po.Register("central-position", &P, "Central context-window position " - "(zero-based)"); - po.Register("phone-map", &phone_map_rxfilename, - "File name containing old->new phone mapping (each line is: " - "old-integer-id new-integer-id)"); - + opts.Register(&po); + po.Read(argc, argv); - if (po.NumArgs() < 3 || po.NumArgs() > 4) { + if (po.NumArgs() != 4) { po.PrintUsage(); exit(1); } @@ -71,22 +59,8 @@ int main(int argc, char *argv[]) { alignment_rspecifier = po.GetArg(3), accs_out_wxfilename = po.GetOptArg(4); - std::vector phone_map; - if (phone_map_rxfilename != "") { // read phone map. 
- ReadPhoneMap(phone_map_rxfilename, - &phone_map); - } - - std::vector ci_phones; - if (ci_phones_str != "") { - SplitStringToIntegers(ci_phones_str, ":", false, &ci_phones); - std::sort(ci_phones.begin(), ci_phones.end()); - if (!IsSortedAndUniq(ci_phones) || ci_phones[0] == 0) { - KALDI_ERR << "Invalid set of ci_phones: " << ci_phones_str; - } - } - + AccumulateTreeStatsInfo acc_tree_stats_info(opts); TransitionModel trans_model; { @@ -117,15 +91,10 @@ int main(int argc, char *argv[]) { continue; } - ////// This is the important part of this program. //////// AccumulateTreeStats(trans_model, - var_floor, - N, - P, - ci_phones, + acc_tree_stats_info, alignment, mat, - (phone_map_rxfilename != "" ? &phone_map : NULL), &tree_stats); num_done++; if (num_done % 1000 == 0) @@ -135,9 +104,9 @@ int main(int argc, char *argv[]) { BuildTreeStatsType stats; // vectorized form. - for (std::map::const_iterator iter = tree_stats.begin(); - iter != tree_stats.end(); - iter++ ) { + for (std::map::const_iterator iter = tree_stats.begin(); + iter != tree_stats.end(); + ++iter) { stats.push_back(std::make_pair(iter->first, iter->second)); } tree_stats.clear(); diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc index 9b95721bd33..b370dbc7f18 100644 --- a/src/bin/ali-to-phones.cc +++ b/src/bin/ali-to-phones.cc @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { std::string model_filename = po.GetArg(1), alignments_rspecifier = po.GetArg(2); - + TransitionModel trans_model; ReadKaldiObject(model_filename, &trans_model); @@ -77,12 +77,12 @@ int main(int argc, char *argv[]) { (write_lengths ? empty : po.GetArg(3))); Int32PairVectorWriter pair_writer(ctm_output ? empty : (write_lengths ? po.GetArg(3) : empty)); - + std::string ctm_wxfilename(ctm_output ? po.GetArg(3) : empty); Output ctm_writer(ctm_wxfilename, false); if (ctm_output) { ctm_writer.Stream() << std::fixed; - ctm_writer.Stream().precision(2); + ctm_writer.Stream().precision(frame_shift >= 0.01 ? 
2 : 3); } int32 n_done = 0; diff --git a/src/bin/align-equal-compiled.cc b/src/bin/align-equal-compiled.cc index 663309a589b..c4ab9d4205a 100644 --- a/src/bin/align-equal-compiled.cc +++ b/src/bin/align-equal-compiled.cc @@ -39,7 +39,7 @@ int main(int argc, char *argv[]) { const char *usage = "Write an equally spaced alignment (for getting training started)" "Usage: align-equal-compiled \n" "e.g.: \n" - " align-equal-compiled 1.mdl 1.fsts scp:train.scp ark:equal.ali\n"; + " align-equal-compiled 1.fsts scp:train.scp ark:equal.ali\n"; ParseOptions po(usage); bool binary = true; diff --git a/src/bin/align-text.cc b/src/bin/align-text.cc index 04172f3b5f3..833e29efe3b 100644 --- a/src/bin/align-text.cc +++ b/src/bin/align-text.cc @@ -47,7 +47,9 @@ int main(int argc, char *argv[]) { "\n" "Usage: align-text [options] \\\n" " \n" - " e.g.: align-text ark:text1.txt ark:text2.txt ark,t:alignment.txt\n"; + " e.g.: align-text ark:text1.txt ark:text2.txt ark,t:alignment.txt\n" + "See also: compute-wer,\n" + "Example scoring script: egs/wsj/s5/steps/score_kaldi.sh\n"; ParseOptions po(usage); diff --git a/src/bin/am-info.cc b/src/bin/am-info.cc index e8cdc1977ec..6afb0c5014e 100644 --- a/src/bin/am-info.cc +++ b/src/bin/am-info.cc @@ -1,4 +1,4 @@ -// gmmbin/am-info.cc +// bin/am-info.cc // Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey) diff --git a/src/bin/analyze-counts.cc b/src/bin/analyze-counts.cc index 60be710c79d..80d43891696 100644 --- a/src/bin/analyze-counts.cc +++ b/src/bin/analyze-counts.cc @@ -1,6 +1,6 @@ // bin/analyze-counts.cc -// Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +// Copyright 2012-2016 Brno University of Technology (Author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -34,22 +34,38 @@ int main(int argc, char *argv[]) { try { const char *usage = "Computes element counts from integer vector table.\n" - "(e.g. for example to get pdf-counts to estimate DNN-output priors, for data analysis)\n" + "(e.g. 
get pdf-counts to estimate DNN-output priors " + "for data analysis)\n" "Verbosity : level 1 => print frequencies and histogram\n" "\n" - "Usage: analyze-counts [options] \n" + "Usage: analyze-counts [options] " + "\n" "e.g.: \n" " analyze-counts ark:1.ali prior.counts\n" " Show phone counts by:\n" - " ali-to-phone --per-frame=true ark:1.ali ark:- | analyze-counts --verbose=1 ark:- - >/dev/null\n"; - + " ali-to-phone --per-frame=true ark:1.ali ark:- |" + " analyze-counts --verbose=1 ark:- - >/dev/null\n"; + ParseOptions po(usage); - + bool binary = false; std::string symbol_table_filename = ""; - + po.Register("binary", &binary, "write in binary mode"); - po.Register("symbol-table", &symbol_table_filename, "Read symbol table for display of counts"); + po.Register("symbol-table", &symbol_table_filename, + "Read symbol table for display of counts"); + + int32 counts_dim = 0; + po.Register("counts-dim", &counts_dim, + "Output dimension of the counts, " + "a hint for dimension auto-detection."); + + std::string frame_weights; + po.Register("frame-weights", &frame_weights, + "Per-frame weights (counting weighted frames)."); + std::string utt_weights; + po.Register("utt-weights", &utt_weights, + "Per-utterance weights (counting weighted frames)."); po.Read(argc, argv); @@ -61,79 +77,121 @@ int main(int argc, char *argv[]) { std::string alignments_rspecifier = po.GetArg(1), wxfilename = po.GetArg(2); - SequentialInt32VectorReader reader(alignments_rspecifier); + SequentialInt32VectorReader alignment_reader(alignments_rspecifier); - // Get the counts - std::vector counts; - int32 num_done = 0; - for (; !reader.Done(); reader.Next()) { - std::string key = reader.Key(); - std::vector alignment = reader.Value(); + RandomAccessBaseFloatVectorReader weights_reader; + if (frame_weights != "") { + weights_reader.Open(frame_weights); + } + RandomAccessBaseFloatReader utt_weights_reader; + if (utt_weights != "") { + utt_weights_reader.Open(utt_weights); + } - for (size_t i = 0; i < alignment.size(); i++) { - int32 value = alignment[i]; - if(value >= counts.size()) { - counts.resize(value+1); + // Buffer for accumulating the counts + Vector counts(counts_dim, kSetZero); + + int32 num_done = 0, num_other_error = 0; + for (; !alignment_reader.Done(); alignment_reader.Next()) { + std::string utt = alignment_reader.Key(); + const std::vector &alignment = alignment_reader.Value(); + + BaseFloat utt_w = 1.0; + // Check if per-utterance weights are provided + if (utt_weights != "") { + if (!utt_weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-utterance weight"; + num_other_error++; + continue; + } else { + utt_w = utt_weights_reader.Value(utt); + } + } + + Vector frame_w; + // Check if per-frame weights are provided + if (frame_weights != "") { + if (!weights_reader.HasKey(utt)) { + KALDI_WARN << utt << ", missing per-frame weights"; + num_other_error++; + continue; + } else { + frame_w = weights_reader.Value(utt); + KALDI_ASSERT(frame_w.Dim() == alignment.size()); } - counts[value]++; // Accumulate } + // Accumulate the counts + for (size_t i = 0; i < alignment.size(); i++) { + KALDI_ASSERT(alignment[i] >= 0); + // Extend the vector if it is not large enough to hold every pdf-ids + if (alignment[i] >= counts.Dim()) { + counts.Resize(alignment[i]+1, kCopyData); + } + if (frame_weights != "") { + counts(alignment[i]) += 1.0 * utt_w * frame_w(i); + } else { + counts(alignment[i]) += 1.0 * utt_w; + } + } num_done++; } - // We need at least one occurence for each tgt, so there is no nan during 
decoding - std::vector counts_nozero(counts); - for(size_t i = 0; i < counts.size(); i++) { - if(counts_nozero[i] == 0) { - KALDI_WARN << "Zero count for element " << i << ", force setting to one." - << " This avoids divide-by-zero when we use the counts in decoding."; - counts_nozero[i]++; + // Report elements with zero counts + for (size_t i = 0; i < counts.Dim(); i++) { + if (0.0 == counts(i)) { + KALDI_WARN << "Zero count for label " << i << ", this is suspicious."; } } - // Write + // Add a ``half-frame'' to all the elements to + // avoid zero-counts which would cause problems in decoding + Vector counts_nozero(counts); + counts_nozero.Add(0.5); + Output ko(wxfilename, binary); - WriteIntegerVector(ko.Stream(), binary, counts_nozero); + counts_nozero.Write(ko.Stream(), binary); - //// - //// THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG - //// + // + // THE REST IS FOR ANALYSIS, IT GETS PRINTED TO LOG + // if (symbol_table_filename != "" || (kaldi::g_kaldi_verbose_level >= 1)) { - // load the symbol table fst::SymbolTable *elem_syms = NULL; if (symbol_table_filename != "") { elem_syms = fst::SymbolTable::ReadText(symbol_table_filename); if (!elem_syms) - KALDI_ERR << "Could not read symbol table from file " << symbol_table_filename; + KALDI_ERR << "Could not read symbol table from file " + << symbol_table_filename; } - + // sort the counts - std::vector > sorted_counts; - for (int32 i = 0; i < counts.size(); i++) { - sorted_counts.push_back(std::make_pair(static_cast(counts[i]), i)); + std::vector > sorted_counts; + for (int32 i = 0; i < counts.Dim(); i++) { + sorted_counts.push_back( + std::make_pair(static_cast(counts(i)), i)); } std::sort(sorted_counts.begin(), sorted_counts.end()); - - // print std::ostringstream os; - int32 sum = std::accumulate(counts.begin(),counts.end(), 0); + double sum = counts.Sum(); os << "Printing...\n### The sorted count table," << std::endl; os << "count\t(norm),\tid\t(symbol):" << std::endl; - for (int32 i=0; i(sorted_counts[i].first) / sum << "),\t" - << sorted_counts[i].second << "\t" - << (elem_syms != NULL ? std::string("(")+elem_syms->Find(sorted_counts[i].second)+")" : "") + << sorted_counts[i].second << "\t" + << (elem_syms != NULL ? "(" + + elem_syms->Find(sorted_counts[i].second) + ")" : "") << std::endl; } - os << "\n#total " << sum - << " (" << static_cast(sum)/100/3600 << "h)" + os << "\n#total " << sum + << " (" << static_cast(sum)/100/3600 << "h)" << std::endl; KALDI_LOG << os.str(); } - KALDI_LOG << "Summed " << num_done << " int32 vectors to counts."; + KALDI_LOG << "Summed " << num_done << " int32 vectors to counts, " + << "skipped " << num_other_error << " vectors."; KALDI_LOG << "Counts written to " << wxfilename; return 0; } catch(const std::exception &e) { @@ -141,5 +199,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/arpa2fst.cc b/src/bin/arpa2fst.cc deleted file mode 100755 index b118aba3f94..00000000000 --- a/src/bin/arpa2fst.cc +++ /dev/null @@ -1,62 +0,0 @@ -// bin/arpa2fst.cc -// -// Copyright 2009-2011 Gilles Boulianne. -// -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -/// @addtogroup LanguageModel -/// @{ - -/** - * @file arpa2fst.cc - * @brief Example for converting an ARPA format language model into an FST. - * - */ - -#include -#include "lm/kaldi-lm.h" -#include "util/parse-options.h" - -int main(int argc, char *argv[]) { - try { - const char *usage = - "Converts an ARPA format language model into a FST\n" - "Usage: arpa2fst [opts] (input_arpa|-) [output_fst|-]\n"; - kaldi::ParseOptions po(usage); - - bool natural_base = true; - po.Register("natural-base", &natural_base, "Use log-base e (not log-base 10)"); - po.Read(argc, argv); - - if (po.NumArgs() != 1 && po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - std::string arpa_filename = po.GetArg(1), - fst_filename = po.GetOptArg(2); - - kaldi::LangModelFst lm; - // read from standard input and write to standard output - lm.Read(arpa_filename, kaldi::kArpaLm, NULL, natural_base); - lm.Write(fst_filename); - exit(0); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} -/// @} - diff --git a/src/bin/compile-questions.cc b/src/bin/compile-questions.cc index 09225f58217..a6caafcc3f4 100644 --- a/src/bin/compile-questions.cc +++ b/src/bin/compile-questions.cc @@ -67,14 +67,23 @@ int main(int argc, char *argv[]) { " compile-questions questions.txt questions.qst\n"; bool binary = true; int32 P = 1, N = 3; - int32 num_iters_refine = 0; + int32 num_iters_refine = 0, + leftmost_questions_truncate = -1; ParseOptions po(usage); - po.Register("binary", &binary, "Write output in binary mode"); - po.Register("context-width", &N, "Context window size [must match acc-tree-stats]."); - po.Register("central-position", &P, "Central position in phone context window [must match acc-tree-stats]"); - po.Register("num-iters-refine", &num_iters_refine, "Number of iters of refining questions at each node. >0 --> questions not shared"); + po.Register("binary", &binary, + "Write output in binary mode"); + po.Register("context-width", &N, + "Context window size [must match acc-tree-stats]."); + po.Register("central-position", &P, + "Central position in phone context window [must match acc-tree-stats]"); + po.Register("num-iters-refine", &num_iters_refine, + "Number of iters of refining questions at each node. >0 --> questions " + "not refined"); + po.Register("leftmost-questions-truncate", &leftmost_questions_truncate, + "If > 0, the questions for the left-most context position will be " + "truncated to the specified number."); po.Read(argc, argv); @@ -118,9 +127,17 @@ int main(int argc, char *argv[]) { QuestionsForKey phone_opts(num_iters_refine); // the questions-options corresponding to keys 0, 1, .. N-1 which // represent the phonetic context positions (including the central phone). 
- phone_opts.initial_questions = questions; for (int32 n = 0; n < N; n++) { KALDI_LOG << "Setting questions for phonetic-context position "<< n; + if (n == 0 && leftmost_questions_truncate > 0 && + leftmost_questions_truncate < questions.size()) { + KALDI_LOG << "Truncating " << questions.size() << " to " + << leftmost_questions_truncate << " for position 0."; + phone_opts.initial_questions.assign( + questions.begin(), questions.begin() + leftmost_questions_truncate); + } else { + phone_opts.initial_questions = questions; + } qo.SetQuestionsOf(n, phone_opts); } diff --git a/src/bin/compute-wer-bootci.cc b/src/bin/compute-wer-bootci.cc new file mode 100644 index 00000000000..1166cae2421 --- /dev/null +++ b/src/bin/compute-wer-bootci.cc @@ -0,0 +1,254 @@ +// bin/compute-wer-bootci.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2014 Johns Hopkins University (authors: Jan Trmal, Daniel Povey) +// 2015 Brno Universiry of technology (author: Karel Vesely) +// 2016 Nicolas Serrano + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "util/parse-options.h" +#include "tree/context-dep.h" +#include "util/edit-distance.h" +#include "base/kaldi-math.h" + +namespace kaldi { + +void GetEditsSingleHyp( const std::string &hyp_rspecifier, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + int32 num_words = 0, word_errs = 0, num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); + } + num_words = ref_sent.size(); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + } +} + +void GetEditsDualHyp(const std::string &hyp_rspecifier, + const std::string &hyp_rspecifier2, + const std::string &ref_rspecifier, + const std::string &mode, + std::vector > & edit_word_per_hyp, + std::vector > & edit_word_per_hyp2) { + + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + RandomAccessTokenVectorReader hyp_reader2(hyp_rspecifier2); + int32 num_words = 0, word_errs = 0, + num_ins = 0, num_del = 0, num_sub = 0; + + // Main loop, store WER stats per hyp, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent, hyp_sent2; + if (mode == "strict" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) { + KALDI_ERR << "No hypothesis for key " << key << " in both transcripts " + "comparison is not possible."; + } else if (mode == "present" && + (!hyp_reader.HasKey(key) || !hyp_reader2.HasKey(key))) + continue; + + num_words = ref_sent.size(); + + //all mode, if a hypothesis is not present, consider as an error + if(hyp_reader.HasKey(key)){ + hyp_sent = hyp_reader.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp.push_back(std::pair(word_errs, num_words)); + + if(hyp_reader2.HasKey(key)){ + hyp_sent2 = hyp_reader2.Value(key); + word_errs = LevenshteinEditDistance(ref_sent, hyp_sent2, + &num_ins, &num_del, &num_sub); + } + else + word_errs = num_words; + edit_word_per_hyp2.push_back(std::pair(word_errs, num_words)); + } +} + +void GetBootstrapWERInterval( + const std::vector > & edit_word_per_hyp, + int32 replications, + BaseFloat *mean, BaseFloat *interval) { + BaseFloat wer_accum = 0.0, wer_mult_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 num_words = 0, word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first; + num_words += edit_word_per_hyp[random_pos].second; + } + + BaseFloat wer_rep = static_cast(word_errs) / num_words; + wer_accum += wer_rep; + wer_mult_accum += wer_rep*wer_rep; + } + + // Compute mean WER and std WER + *mean = wer_accum / replications; + *interval = 1.96*sqrt(wer_mult_accum/replications-(*mean)*(*mean)); +} + +void GetBootstrapWERTwoSystemComparison( + const std::vector > & edit_word_per_hyp, + const std::vector > & edit_word_per_hyp2, + int32 replications, BaseFloat *p_improv) { + int32 improv_accum = 0.0; + + for (int32 i = 0; i <= replications; ++i) { + int32 word_errs = 0; + for (int32 j = 0; j <= edit_word_per_hyp.size(); ++j) { + int32 random_pos = kaldi::RandInt(0, edit_word_per_hyp.size()); + word_errs += edit_word_per_hyp[random_pos].first - + edit_word_per_hyp2[random_pos].first; + } + if(word_errs > 0) + ++improv_accum; + } + // Compute mean WER and std WER + *p_improv = static_cast(improv_accum) / replications; +} + +} //namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + + try { + const char *usage = + "Compute a bootstrapping of WER to 
extract the 95\% confidence interval.\n" + "Take a reference and a transcription file, in integer or text format,\n" + "and outputs overall WER statistics to standard output along with its\n" + "confidence interval using the bootstrap methos of Bisani and Ney.\n" + "If a second transcription file corresponding to the same reference is\n" + "provided, a bootstrap comparison of the two transcription is performed\n" + "to estimate the probability of improvement.\n" + "\n" + "Usage: compute-wer-bootci [options] []\n" + "E.g.: compute-wer-bootci --mode=present ark:data/train/text ark:hyp_text\n" + "or compute-wer-bootci ark:data/train/text ark:hyp_text ark:hyp_text2\n" + "See also: compute-wer\n"; + + ParseOptions po(usage); + + std::string mode = "strict"; + po.Register("mode", &mode, + "Scoring mode: \"present\"|\"all\"|\"strict\":\n" + " \"present\" means score those we have transcriptions for\n" + " \"all\" means treat absent transcriptions as empty\n" + " \"strict\" means die if all in ref not also in hyp"); + + int32 replications = 10000; + po.Register("replications", &replications, + "Number of replications to compute the intervals"); + + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string ref_rspecifier = po.GetArg(1); + std::string hyp_rspecifier = po.GetArg(2); + std::string hyp2_rspecifier = (po.NumArgs() == 3?po.GetArg(3):""); + + if (mode != "strict" && mode != "present" && mode != "all") { + KALDI_ERR << + "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " + << mode; + } + + //Get editions per each utterance + std::vector > edit_word_per_hyp, edit_word_per_hyp2; + if(hyp2_rspecifier.empty()) + GetEditsSingleHyp(hyp_rspecifier, ref_rspecifier, mode, edit_word_per_hyp); + else + GetEditsDualHyp(hyp_rspecifier, hyp2_rspecifier, ref_rspecifier, mode, + edit_word_per_hyp, edit_word_per_hyp2); + + //Extract WER for a number of replications of the same size + //as the hypothesis extracted + BaseFloat mean_wer = 0.0, interval = 0.0, + mean_wer2 = 0.0, interval2 = 0.0, + p_improv = 0.0; + + GetBootstrapWERInterval(edit_word_per_hyp, replications, + &mean_wer, &interval); + + if(!hyp2_rspecifier.empty()) { + GetBootstrapWERInterval(edit_word_per_hyp2, replications, + &mean_wer2, &interval2); + + GetBootstrapWERTwoSystemComparison(edit_word_per_hyp, edit_word_per_hyp2, + replications, &p_improv); + } + + // Print the output, + std::cout.precision(2); + std::cerr.precision(2); + std::cout << "Set1: %WER " << std::fixed << 100*mean_wer << + " 95\% Conf Interval [ " << 100*mean_wer-100*interval << + ", " << 100*mean_wer+100*interval << " ]" << '\n'; + + if(!hyp2_rspecifier.empty()) { + std::cout << "Set2: %WER " << std::fixed << 100*mean_wer2 << + " 95\% Conf Interval [ " << 100*mean_wer2-100*interval2 << + ", " << 100*mean_wer2+100*interval2 << " ]" << '\n'; + + std::cout << "Probability of Set2 improving Set1: " << std::fixed << + 100*p_improv << '\n'; + } + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/bin/compute-wer.cc b/src/bin/compute-wer.cc index 97e025d2c22..3d5b42c7f1d 100644 --- a/src/bin/compute-wer.cc +++ b/src/bin/compute-wer.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2014 Johns Hopkins University (authors: Jan Trmal, Daniel Povey) +// 2015 Brno Universiry of technology (author: Karel Vesely) // See ../../COPYING for clarification regarding multiple authors // @@ -24,42 +25,6 @@ #include "tree/context-dep.h" 
#include "util/edit-distance.h" - -namespace kaldi { - - -template -void PrintAlignmentStats(const std::vector &ref, - const std::vector &hyp, - T eps, - std::ostream &os) { - // Make sure the eps symbol is not in the sentences we're aligning; this would - // not make sense. - KALDI_ASSERT(std::find(ref.begin(), ref.end(), eps) == ref.end()); - KALDI_ASSERT(std::find(hyp.begin(), hyp.end(), eps) == hyp.end()); - - std::vector > aligned; - typedef typename std::vector >::const_iterator aligned_iterator; - - LevenshteinAlignment(ref, hyp, eps, &aligned); - for (aligned_iterator it = aligned.begin(); - it != aligned.end(); ++it) { - KALDI_ASSERT(!(it->first == eps && it->second == eps)); - if (it->first == eps) { - os << "insertion " << it->second << std::endl; - } else if (it->second == eps) { - os << "deletion " << it->first << std::endl; - } else if (it->first != it->second) { - os << "substitution " << it->first << ' ' << it->second << std::endl; - } else { - os << "correct " << it->first << std::endl; - } - } -} - -} - - int main(int argc, char *argv[]) { using namespace kaldi; typedef kaldi::int32 int32; @@ -69,29 +34,27 @@ int main(int argc, char *argv[]) { "Compute WER by comparing different transcriptions\n" "Takes two transcription files, in integer or text format,\n" "and outputs overall WER statistics to standard output.\n" - "Optionally, the third argument can be used to obtain detailed statistics\n" - "\n" - "Usage: compute-wer [options] []\n" "\n" + "Usage: compute-wer [options] \n" "E.g.: compute-wer --text --mode=present ark:data/train/text ark:hyp_text\n" - "or: compute-wer --text --mode=present ark:data/train/text ark:hyp_text - | \\\n" - " sort | uniq -c\n"; + "See also: align-text,\n" + "Example scoring script: egs/wsj/s5/steps/score_kaldi.sh\n"; ParseOptions po(usage); std::string mode = "strict"; - bool text_input = false; // if this is true, we expect symbols as strings, - po.Register("mode", &mode, "Scoring mode: \"present\"|\"all\"|\"strict\":\n" " \"present\" means score those we have transcriptions for\n" " \"all\" means treat absent transcriptions as empty\n" " \"strict\" means die if all in ref not also in hyp"); - po.Register("text", &text_input, "Expect strings, not integers, as input."); + + bool dummy = false; + po.Register("text", &dummy, "Deprecated option! 
Keeping for compatibility reasons."); po.Read(argc, argv); - if (po.NumArgs() < 2 || po.NumArgs() > 3) { + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } @@ -99,103 +62,62 @@ int main(int argc, char *argv[]) { std::string ref_rspecifier = po.GetArg(1); std::string hyp_rspecifier = po.GetArg(2); - Output stats_output; - bool detailed_stats = (po.NumArgs() == 3); - if (detailed_stats) - stats_output.Open(po.GetOptArg(3), false, false); // non-binary output - if (mode != "strict" && mode != "present" && mode != "all") { KALDI_ERR << "--mode option invalid: expected \"present\"|\"all\"|\"strict\", got " << mode; } - - int32 num_words = 0, word_errs = 0, num_sent = 0, sent_errs = 0, num_ins = 0, num_del = 0, num_sub = 0, num_absent_sents = 0; - if (!text_input) { - SequentialInt32VectorReader ref_reader(ref_rspecifier); - RandomAccessInt32VectorReader hyp_reader(hyp_rspecifier); - - for (; !ref_reader.Done(); ref_reader.Next()) { - std::string key = ref_reader.Key(); - const std::vector &ref_sent = ref_reader.Value(); - std::vector hyp_sent; - if (!hyp_reader.HasKey(key)) { - if (mode == "strict") - KALDI_ERR << "No hypothesis for key " << key << " and strict " - "mode specifier."; - num_absent_sents++; - if (mode == "present") // do not score this one. - continue; - } else { - hyp_sent = hyp_reader.Value(key); - } - num_words += ref_sent.size(); - int32 ins, del, sub; - word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, - &ins, &del, &sub); - num_ins += ins; - num_del += del; - num_sub += sub; - - if (detailed_stats) { - const int32 eps = -1; - PrintAlignmentStats(ref_sent, hyp_sent, eps, stats_output.Stream()); - } - num_sent++; - sent_errs += (ref_sent != hyp_sent); - } - } else { - SequentialTokenVectorReader ref_reader(ref_rspecifier); - RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); - - for (; !ref_reader.Done(); ref_reader.Next()) { - std::string key = ref_reader.Key(); - const std::vector &ref_sent = ref_reader.Value(); - std::vector hyp_sent; - if (!hyp_reader.HasKey(key)) { - if (mode == "strict") - KALDI_ERR << "No hypothesis for key " << key << " and strict " - "mode specifier."; - num_absent_sents++; - if (mode == "present") // do not score this one. - continue; - } else { - hyp_sent = hyp_reader.Value(key); - } - num_words += ref_sent.size(); - int32 ins, del, sub; - word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, - &ins, &del, &sub); - num_ins += ins; - num_del += del; - num_sub += sub; - - if (detailed_stats) { - const std::string eps = ""; - PrintAlignmentStats(ref_sent, hyp_sent, eps, stats_output.Stream()); - } - num_sent++; - sent_errs += (ref_sent != hyp_sent); + // Both text and integers are loaded as vector of strings, + SequentialTokenVectorReader ref_reader(ref_rspecifier); + RandomAccessTokenVectorReader hyp_reader(hyp_rspecifier); + + // Main loop, accumulate WER stats, + for (; !ref_reader.Done(); ref_reader.Next()) { + std::string key = ref_reader.Key(); + const std::vector &ref_sent = ref_reader.Value(); + std::vector hyp_sent; + if (!hyp_reader.HasKey(key)) { + if (mode == "strict") + KALDI_ERR << "No hypothesis for key " << key << " and strict " + "mode specifier."; + num_absent_sents++; + if (mode == "present") // do not score this one. 
+ continue; + } else { + hyp_sent = hyp_reader.Value(key); } + num_words += ref_sent.size(); + int32 ins, del, sub; + word_errs += LevenshteinEditDistance(ref_sent, hyp_sent, &ins, &del, &sub); + num_ins += ins; + num_del += del; + num_sub += sub; + + num_sent++; + sent_errs += (ref_sent != hyp_sent); } + // Compute WER, SER, BaseFloat percent_wer = 100.0 * static_cast(word_errs) / static_cast(num_words); + BaseFloat percent_ser = 100.0 * static_cast(sent_errs) + / static_cast(num_sent); + + // Print the ouptut, std::cout.precision(2); std::cerr.precision(2); std::cout << "%WER " << std::fixed << percent_wer << " [ " << word_errs << " / " << num_words << ", " << num_ins << " ins, " << num_del << " del, " << num_sub << " sub ]" << (num_absent_sents != 0 ? " [PARTIAL]" : "") << '\n'; - BaseFloat percent_ser = 100.0 * static_cast(sent_errs) - / static_cast(num_sent); std::cout << "%SER " << std::fixed << percent_ser << " [ " << sent_errs << " / " << num_sent << " ]\n"; std::cout << "Scored " << num_sent << " sentences, " << num_absent_sents << " not present in hyp.\n"; + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/bin/convert-ali.cc b/src/bin/convert-ali.cc index 97a503b26ab..3a52b7904a0 100644 --- a/src/bin/convert-ali.cc +++ b/src/bin/convert-ali.cc @@ -32,16 +32,24 @@ int main(int argc, char *argv[]) { try { const char *usage = "Convert alignments from one decision-tree/model to another\n" - "Usage: convert-ali [options] old-model new-model new-tree old-alignments-rspecifier new-alignments-wspecifier\n" + "Usage: convert-ali [options] " + " \n" "e.g.: \n" - " convert-ali old.mdl new.mdl new.tree ark:old.ali ark:new.ali\n"; + " convert-ali old/final.mdl new/0.mdl new/tree ark:old/ali.1 ark:new/ali.1\n"; + int32 frame_subsampling_factor = 1; + bool reorder = true; std::string phone_map_rxfilename; ParseOptions po(usage); po.Register("phone-map", &phone_map_rxfilename, "File name containing old->new phone mapping (each line is: " "old-integer-id new-integer-id)"); + po.Register("reorder", &reorder, + "True if you want the converted alignments to be 'reordered' " + "versus the way they appear in the HmmTopology object"); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, + "Can be used in converting alignments to reduced frame rates."); po.Read(argc, argv); @@ -61,7 +69,7 @@ int main(int argc, char *argv[]) { ReadPhoneMap(phone_map_rxfilename, &phone_map); } - + SequentialInt32VectorReader alignment_reader(old_alignments_rspecifier); Int32VectorWriter alignment_writer(new_alignments_wspecifier); @@ -74,8 +82,8 @@ int main(int argc, char *argv[]) { if (!(old_trans_model.GetTopo() == new_trans_model.GetTopo())) KALDI_WARN << "Toplogies of models are not equal: " << "conversion may not be correct or may fail."; - - + + ContextDependency new_ctx_dep; // the tree. ReadKaldiObject(new_tree_filename, &new_ctx_dep); @@ -86,11 +94,13 @@ int main(int argc, char *argv[]) { const std::vector &old_alignment = alignment_reader.Value(); std::vector new_alignment; if (ConvertAlignment(old_trans_model, - new_trans_model, - new_ctx_dep, - old_alignment, - (phone_map_rxfilename != "" ? &phone_map : NULL), - &new_alignment)) { + new_trans_model, + new_ctx_dep, + old_alignment, + frame_subsampling_factor, + reorder, + (phone_map_rxfilename != "" ? 
&phone_map : NULL), + &new_alignment)) { alignment_writer.Write(key, new_alignment); num_success++; } else { @@ -101,7 +111,7 @@ int main(int argc, char *argv[]) { } KALDI_LOG << "Succeeded converting alignments for " << num_success - <<" files, failed for " << num_fail; + << " files, failed for " << num_fail; if (num_success != 0) return 0; else return 1; @@ -110,5 +120,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/copy-transition-model.cc b/src/bin/copy-transition-model.cc index 2debe64a674..62a5d0c51dd 100644 --- a/src/bin/copy-transition-model.cc +++ b/src/bin/copy-transition-model.cc @@ -35,10 +35,10 @@ int main(int argc, char *argv[]) { " models from the acoustic models they are written with.\n" "Usage: copy-transition-model [options] \n" "e.g.: \n" - " copy-transition-model --binarhy=false 1.mdl 1.txt\n"; + " copy-transition-model --binary=false 1.mdl 1.txt\n"; bool binary; - + ParseOptions po(usage); po.Register("binary", &binary, "Write output in binary mode."); diff --git a/src/bin/decode-faster-mapped.cc b/src/bin/decode-faster-mapped.cc index 90c7125f927..c7411592504 100644 --- a/src/bin/decode-faster-mapped.cc +++ b/src/bin/decode-faster-mapped.cc @@ -160,7 +160,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count << " frames."; - if (word_syms) delete word_syms; + delete word_syms; delete decode_fst; if (num_success != 0) return 0; else return 1; diff --git a/src/bin/decode-faster.cc b/src/bin/decode-faster.cc index 6e5851e12f7..cbcdb771d56 100644 --- a/src/bin/decode-faster.cc +++ b/src/bin/decode-faster.cc @@ -156,7 +156,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count << " frames."; - if (word_syms) delete word_syms; + delete word_syms; delete decode_fst; if (num_success != 0) return 0; else return 1; diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index ed869ff250b..a534fdf78de 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -111,8 +111,8 @@ int main(int argc, char **argv) { renderer->Render(query); } - if (renderer) delete renderer; - if (query) delete query; + delete renderer; + delete query; } catch (const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/bin/extract-ctx.cc b/src/bin/extract-ctx.cc index d3c36119581..b361a1d8707 100644 --- a/src/bin/extract-ctx.cc +++ b/src/bin/extract-ctx.cc @@ -30,7 +30,6 @@ #include "fst/fstlib.h" using namespace kaldi; - using std::vector; // Generate a string representation of the given EventType; the symtable is @@ -41,7 +40,7 @@ static std::string EventTypeToString(EventType &e, bool addpos) { // make sure it's sorted so that the kPdfClass is the first element! std::sort(e.begin(), e.end()); - + // first plot the pdf-class std::stringstream ss; ss << e[0].second; @@ -49,7 +48,7 @@ static std::string EventTypeToString(EventType &e, ss << " "; if (addpos) ss << (i-1) << ":"; - + if (phones_symtab == NULL) ss << e[i].second; else { @@ -69,6 +68,7 @@ static std::string EventTypeToString(EventType &e, int main(int argc, char *argv[]) { try { typedef kaldi::int32 int32; + const char *usage = "Given the tree stats and the resulting tree, output a mapping of phones\n" "in context (and pdf-class) to the pdf-id. 
This can be used to link the\n" @@ -77,16 +77,16 @@ int main(int argc, char *argv[]) { "e.g.: \n" " extract-ctx treeacc tree\n" " extract-ctx --mono 48 tree\n"; - + ParseOptions po(usage); - + std::string fsymboltab; bool addpos = false; bool mono = false; std::string silphones = "1,2,3"; int32 silpdfclasses = 5; int32 nonsilpdfclasses = 3; - + po.Register("mono", &mono, "Assume mono-phone tree; instead of tree stats, specify highest id"); po.Register("sil-phones", &silphones, @@ -100,12 +100,12 @@ int main(int argc, char *argv[]) { po.Register("add-position-indicators", &addpos, "Add position indicators for phonemes"); po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); } - + // read symtab if available fst::SymbolTable *phones_symtab = NULL; if (fsymboltab.length() > 0) { @@ -115,17 +115,17 @@ int main(int argc, char *argv[]) { if (!phones_symtab) KALDI_ERR << "Could not read phones symbol table file "<< fsymboltab; } - + // read the tree, get all the leaves ContextDependency ctx_dep; ReadKaldiObject(po.GetArg(2), &ctx_dep); const EventMap &map = ctx_dep.ToPdfMap(); - + // here we have to do different things for mono and tri+ trees if (mono) { // A mono-phone tree is not actually a real tree. We test for EventTypes // that have the central phone and the possible pdf-classes - + int32 maxs = atoi(po.GetArg(1).c_str()); if (phones_symtab != NULL) { size_t ns = phones_symtab->NumSymbols(); @@ -135,10 +135,10 @@ int main(int argc, char *argv[]) { maxs = (ns-1); } } - + // parse silphones std::set silset; - + std::string::size_type i1 = 0, i2; do { i2 = silphones.find(',', i1); @@ -148,19 +148,19 @@ int main(int argc, char *argv[]) { break; i1 = i2 + 1; } while (true); - - + + // now query each phone (ignore which is 0) for (int32 p = 1; p <= maxs; ++p) { int32 mpdf = (silset.find(p) == silset.end() ? 
nonsilpdfclasses : silpdfclasses); - + for (int i = 0; i < mpdf; ++i) { EventType et; et.push_back(std::pair(kPdfClass, i)); et.push_back(std::pair(0, p)); - + EventAnswerType ans; if (map.Map(et, &ans)) { std::cout << ans << " " @@ -173,12 +173,12 @@ int main(int argc, char *argv[]) { } } - + } else { // for tri+ trees, read the tree stats; this gives us basically all // phones-in-context that may be linked to an individual model // (in practice, many of them will be shared, but we plot them anyways) - + // build-tree-questions.h:typedef std::vector > BuildTreeStatsType BuildTreeStatsType stats; { @@ -188,9 +188,9 @@ int main(int argc, char *argv[]) { ReadBuildTreeStats(ki.Stream(), binary_in, gc, &stats); } KALDI_LOG << "Number of separate statistics is " << stats.size(); - + // typedef std::vector > EventType - + // now, for each tree stats element, query the tree to get the pdf-id for (size_t i = 0; i < stats.size(); ++i) { EventAnswerType ans; @@ -204,7 +204,7 @@ int main(int argc, char *argv[]) { } } } - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/bin/latgen-faster-mapped-parallel.cc b/src/bin/latgen-faster-mapped-parallel.cc index 59e7f7170d1..dd4a3269cdf 100644 --- a/src/bin/latgen-faster-mapped-parallel.cc +++ b/src/bin/latgen-faster-mapped-parallel.cc @@ -170,7 +170,7 @@ int main(int argc, char *argv[]) { } sequencer.Wait(); - if (decode_fst != NULL) delete decode_fst; + delete decode_fst; double elapsed = timer.Elapsed(); KALDI_LOG << "Decoded with " << sequencer_config.num_threads << " threads."; @@ -182,7 +182,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/latgen-faster-mapped.cc b/src/bin/latgen-faster-mapped.cc index 1ca62ca200c..8043bd31116 100644 --- a/src/bin/latgen-faster-mapped.cc +++ b/src/bin/latgen-faster-mapped.cc @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/latgen-tracking-mapped.cc b/src/bin/latgen-tracking-mapped.cc index 46d2fca5f71..cf89cb17b94 100644 --- a/src/bin/latgen-tracking-mapped.cc +++ b/src/bin/latgen-tracking-mapped.cc @@ -202,7 +202,7 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " << frame_count<<" frames."; - if (word_syms) delete word_syms; + delete word_syms; if (num_success != 0) return 0; else return 1; } catch(const std::exception &e) { diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 65078e26ff9..6c1d2ad9f12 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -110,7 +110,8 @@ int32 TypeTwoUsage(const ParseOptions &po, "matrix-sum: first argument must be an rspecifier"); // if next assert fails it would be bug in the code as otherwise we shouldn't // be called. 
- KALDI_ASSERT(ClassifyRspecifier(po.GetArg(2), NULL, NULL) == kNoRspecifier); + KALDI_ASSERT(ClassifyWspecifier(po.GetArg(2), NULL, NULL, NULL) == + kNoWspecifier); SequentialBaseFloatMatrixReader mat_reader(po.GetArg(1)); @@ -152,12 +153,17 @@ int32 TypeTwoUsage(const ParseOptions &po, int32 TypeThreeUsage(const ParseOptions &po, bool binary) { KALDI_ASSERT(po.NumArgs() >= 2); - for (int32 i = 1; i <= po.NumArgs(); i++) { - if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + for (int32 i = 1; i < po.NumArgs(); i++) { + if (ClassifyRspecifier(po.GetArg(i), NULL, NULL) != kNoRspecifier) { KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " << "tables, the intermediate arguments must not be tables."; } } + if (ClassifyWspecifier(po.GetArg(po.NumArgs()), NULL, NULL, NULL) != + kNoWspecifier) { + KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " + << "tables, the intermediate arguments must not be tables."; + } bool add = true; Matrix mat; @@ -218,19 +224,19 @@ int main(int argc, char *argv[]) { int32 N = po.NumArgs(), exit_status; if (po.NumArgs() >= 2 && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) != kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) { // output to table. exit_status = TypeOneUsage(po, scale1, scale2); } else if (po.NumArgs() == 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == - kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { KALDI_ASSERT(scale1 == 1.0 && scale2 == 1.0); // input from a single table, output not to table. exit_status = TypeTwoUsage(po, binary); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == kNoWspecifier) { KALDI_ASSERT(scale1 == 1.0 && scale2 == 1.0); // summing flat files. exit_status = TypeThreeUsage(po, binary); diff --git a/src/bin/phones-to-prons.cc b/src/bin/phones-to-prons.cc index 2d3caddf78d..f9b9291a90b 100644 --- a/src/bin/phones-to-prons.cc +++ b/src/bin/phones-to-prons.cc @@ -27,7 +27,7 @@ #include "fstext/fstext-lib.h" // Create FST that accepts the phone sequence, with any number -// of word-start and word-end symbol in between each phone. +// of word-start and word-end symbol in between each phone. 
void CreatePhonesAltFst(const std::vector &phones, int32 word_start_sym, int32 word_end_sym, @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) { "e.g.: \n" " ali-to-phones 1.mdl ark:1.ali ark:- | \\\n" " phones-to-prons L_align.fst 46 47 ark:- 1.tra ark:1.prons\n"; - + ParseOptions po(usage); po.Read(argc, argv); @@ -99,7 +99,7 @@ int main(int argc, char *argv[]) { prons_wspecifier = po.GetArg(6); int32 word_start_sym, word_end_sym; - + if (!ConvertStringToInteger(word_start_sym_str, &word_start_sym) || word_start_sym <= 0) KALDI_ERR << "Invalid word start symbol (expecting integer >= 0): " @@ -117,15 +117,15 @@ int main(int argc, char *argv[]) { fst::OLabelCompare olabel_comp; ArcSort(L, olabel_comp); } - + SequentialInt32VectorReader phones_reader(phones_rspecifier); RandomAccessInt32VectorReader words_reader(words_rspecifier); - + int32 n_done = 0, n_err = 0; - + std::string empty; Int32VectorVectorWriter prons_writer(prons_wspecifier); - + for (; !phones_reader.Done(); phones_reader.Next()) { std::string key = phones_reader.Key(); const std::vector &phones = phones_reader.Value(); @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) { // on the input side, and words on the output side. VectorFst phnx2word; Compose(phones_alt_fst, phn2word, &phnx2word); - + if (phnx2word.Start() == fst::kNoStateId) { KALDI_WARN << "phnx2word FST for utterance " << key << "is empty (either decoding for this utterance did " @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { KALDI_ERR << "phnx2word is not a linear transducer (code error?)"; if (words2 != words) KALDI_ERR << "words have changed! (code error?)"; - + // Now, "phnx" should be the phone sequence with start and end // symbols included. At this point we break it up into segments, // and try to match it up with words. 
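// (Hedged sketch, not part of the patch: conceptually, the segmentation step
// described above walks "phnx" and collects the phones between each
// word_start_sym / word_end_sym pair into one pronunciation; the helper name
// below is hypothetical.)
//
//   std::vector<std::vector<int32> > SplitPhnxIntoProns(
//       const std::vector<int32> &phnx,
//       int32 word_start_sym, int32 word_end_sym) {
//     std::vector<std::vector<int32> > prons;
//     std::vector<int32> cur;
//     bool in_word = false;
//     for (size_t i = 0; i < phnx.size(); i++) {
//       if (phnx[i] == word_start_sym) { in_word = true; cur.clear(); }
//       else if (phnx[i] == word_end_sym) { in_word = false; prons.push_back(cur); }
//       else if (in_word) cur.push_back(phnx[i]);   // an ordinary phone.
//     }
//     return prons;
//   }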
@@ -211,7 +211,7 @@ int main(int argc, char *argv[]) { continue; } prons_writer.Write(key, prons); - n_done++; + n_done++; } KALDI_LOG << "Done " << n_done << " utterances; " << n_err << " had errors."; } catch(const std::exception &e) { diff --git a/src/bin/sum-lda-accs.cc b/src/bin/sum-lda-accs.cc index 4988dfb57ca..22f11cc45ce 100644 --- a/src/bin/sum-lda-accs.cc +++ b/src/bin/sum-lda-accs.cc @@ -1,4 +1,4 @@ -// bin/sum-lda.cc +// bin/sum-lda-accs.cc // Copyright 2014 LINSE/UFSC; Augusto Henrique Hentz diff --git a/src/bin/sum-mllt-accs.cc b/src/bin/sum-mllt-accs.cc index 2bb43f0d112..4d580e21e55 100644 --- a/src/bin/sum-mllt-accs.cc +++ b/src/bin/sum-mllt-accs.cc @@ -1,4 +1,4 @@ -// bin/sum-mllt.cc +// bin/sum-mllt-accs.cc // Copyright 2014 LINSE/UFSC; Augusto Henrique Hentz diff --git a/src/bin/sum-tree-stats.cc b/src/bin/sum-tree-stats.cc index a4802a96847..fdaa3d178ca 100644 --- a/src/bin/sum-tree-stats.cc +++ b/src/bin/sum-tree-stats.cc @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { for (std::map::const_iterator iter = tree_stats.begin(); iter != tree_stats.end(); - iter++ ) { + ++iter) { stats.push_back(std::make_pair(iter->first, iter->second)); } tree_stats.clear(); diff --git a/src/bin/transform-vec.cc b/src/bin/transform-vec.cc index d79c8afdef8..4fd390a9ce7 100644 --- a/src/bin/transform-vec.cc +++ b/src/bin/transform-vec.cc @@ -1,4 +1,4 @@ -// featbin/transform-vec.cc +// bin/transform-vec.cc // Copyright 2009-2012 Microsoft Corporation // 2012-2014 Johns Hopkins University (author: Daniel Povey) diff --git a/src/bin/vector-sum.cc b/src/bin/vector-sum.cc index 70f5cec5b7b..42404e38384 100644 --- a/src/bin/vector-sum.cc +++ b/src/bin/vector-sum.cc @@ -101,13 +101,15 @@ int32 TypeOneUsage(const ParseOptions &po) { } int32 TypeTwoUsage(const ParseOptions &po, - bool binary) { + bool binary, + bool average = false) { KALDI_ASSERT(po.NumArgs() == 2); KALDI_ASSERT(ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && "vector-sum: first argument must be an rspecifier"); // if next assert fails it would be bug in the code as otherwise we shouldn't // be called. 
- KALDI_ASSERT(ClassifyRspecifier(po.GetArg(2), NULL, NULL) == kNoRspecifier); + KALDI_ASSERT(ClassifyWspecifier(po.GetArg(2), NULL, NULL, NULL) == + kNoWspecifier); SequentialBaseFloatVectorReader vec_reader(po.GetArg(1)); @@ -132,6 +134,8 @@ int32 TypeTwoUsage(const ParseOptions &po, } } } + + if (num_done > 0 && average) sum.Scale(1.0 / num_done); Vector sum_float(sum); WriteKaldiObject(sum_float, po.GetArg(2), binary); @@ -147,12 +151,17 @@ int32 TypeTwoUsage(const ParseOptions &po, int32 TypeThreeUsage(const ParseOptions &po, bool binary) { KALDI_ASSERT(po.NumArgs() >= 2); - for (int32 i = 1; i <= po.NumArgs(); i++) { - if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { + for (int32 i = 1; i < po.NumArgs(); i++) { + if (ClassifyRspecifier(po.GetArg(i), NULL, NULL) != kNoRspecifier) { KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " << "tables, the intermediate arguments must not be tables."; } } + if (ClassifyWspecifier(po.GetArg(po.NumArgs()), NULL, NULL, NULL) != + kNoWspecifier) { + KALDI_ERR << "Wrong usage (type 3): if first and last arguments are not " + << "tables, the intermediate arguments must not be tables."; + } bool add = true; Vector vec; @@ -193,30 +202,32 @@ int main(int argc, char *argv[]) { " e.g.: vector-sum --binary=false 1.vec 2.vec 3.vec sum.vec\n" "See also: copy-vector, dot-weights\n"; - bool binary; + bool binary, average = false; ParseOptions po(usage); po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); + po.Register("average", &average, "Do average instead of sum"); po.Read(argc, argv); int32 N = po.NumArgs(), exit_status; if (po.NumArgs() >= 2 && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) != kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) { // output to table. exit_status = TypeOneUsage(po); } else if (po.NumArgs() == 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == - kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { // input from a single table, output not to table. - exit_status = TypeTwoUsage(po, binary); + exit_status = TypeTwoUsage(po, binary, average); } else if (po.NumArgs() >= 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) == kNoRspecifier && - ClassifyRspecifier(po.GetArg(N), NULL, NULL) == kNoRspecifier) { + ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == + kNoWspecifier) { // summing flat files. 
exit_status = TypeThreeUsage(po, binary); } else { diff --git a/src/chain/Makefile b/src/chain/Makefile new file mode 100644 index 00000000000..d8fef6f6055 --- /dev/null +++ b/src/chain/Makefile @@ -0,0 +1,35 @@ + +all: + +include ../kaldi.mk +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = chain-supervision-test language-model-test + +OBJFILES = chain-supervision.o chain-numerator.o chain-den-graph.o \ + language-model.o chain-denominator.o chain-training.o +ifeq ($(CUDA), true) + OBJFILES += chain-kernels.o +endif + +LIBNAME = kaldi-chain + +ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ + ../fstext/kaldi-fstext.a \ + ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a + +# Make sure we have CUDA_ARCH from kaldi.mk, +ifeq ($(CUDA), true) + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') + endif +endif + +# Implicit rule for kernel compilation, +%.o : %.cu + $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ + +include ../makefiles/default_rules.mk + diff --git a/src/chain/chain-datastruct.h b/src/chain/chain-datastruct.h new file mode 100644 index 00000000000..52e388a3f2e --- /dev/null +++ b/src/chain/chain-datastruct.h @@ -0,0 +1,55 @@ +// chain/chain-datastruct.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CHAIN_CHAIN_DATASTRUCT_H_ +#define KALDI_CHAIN_CHAIN_DATASTRUCT_H_ +#include "cudamatrix/cu-matrixdim.h" // for CU1DBLOCK and CU2DBLOCK, and int32_cuda + +/** + This header is for declaring "C" structures that are to be used in the + CUDA interface for things in this directory. We put it in a separate header from + the CUDA stuff as it may be needed regardless of whether we're actually compiling with + CUDA. + */ + +extern "C" { + // "C" version of the BaseFloat typedef-- this saves us having to write + // multiple versions of these kernels. +#if (KALDI_DOUBLEPRECISION != 0) + typedef double BaseFloat; +#else + typedef float BaseFloat; +#endif + + struct DenominatorGraphTransition { + BaseFloat transition_prob; // language-model part of the probability (not + // in log) + int32_cuda pdf_id; // pdf-id on the transition. + int32_cuda hmm_state; // source, or destination, HMM state. + }; + + + // Search for this in chain-kernels.cu for an explanation. 
+ enum { kThresholdingPowerOfTwo = 14 }; + +} + + + +#endif diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc new file mode 100644 index 00000000000..eaeac25046d --- /dev/null +++ b/src/chain/chain-den-graph.cc @@ -0,0 +1,389 @@ +// chain/chain-den-graph.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "chain/chain-den-graph.h" +#include "hmm/hmm-utils.h" +#include "fstext/push-special.h" + +namespace kaldi { +namespace chain { + + +DenominatorGraph::DenominatorGraph(const fst::StdVectorFst &fst, + int32 num_pdfs): + num_pdfs_(num_pdfs) { + SetTransitions(fst, num_pdfs); + SetInitialProbs(fst); +} + +const Int32Pair* DenominatorGraph::BackwardTransitions() const { + return backward_transitions_.Data(); +} + +const Int32Pair* DenominatorGraph::ForwardTransitions() const { + return forward_transitions_.Data(); +} + +const DenominatorGraphTransition* DenominatorGraph::Transitions() const { + return transitions_.Data(); +} + +const CuVector& DenominatorGraph::InitialProbs() const { + return initial_probs_; +} + +void DenominatorGraph::SetTransitions(const fst::StdVectorFst &fst, + int32 num_pdfs) { + int32 num_states = fst.NumStates(); + + std::vector > + transitions_out(num_states), + transitions_in(num_states); + for (int32 s = 0; s < num_states; s++) { + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + DenominatorGraphTransition transition; + transition.transition_prob = exp(-arc.weight.Value()); + transition.pdf_id = arc.ilabel - 1; + transition.hmm_state = arc.nextstate; + KALDI_ASSERT(transition.pdf_id >= 0 && transition.pdf_id < num_pdfs); + transitions_out[s].push_back(transition); + // now the reverse transition. 
+ transition.hmm_state = s; + transitions_in[arc.nextstate].push_back(transition); + } + } + + std::vector forward_transitions(num_states); + std::vector backward_transitions(num_states); + std::vector transitions; + + for (int32 s = 0; s < num_states; s++) { + forward_transitions[s].first = static_cast(transitions.size()); + transitions.insert(transitions.end(), transitions_out[s].begin(), + transitions_out[s].end()); + forward_transitions[s].second = static_cast(transitions.size()); + } + for (int32 s = 0; s < num_states; s++) { + backward_transitions[s].first = static_cast(transitions.size()); + transitions.insert(transitions.end(), transitions_in[s].begin(), + transitions_in[s].end()); + backward_transitions[s].second = static_cast(transitions.size()); + } + + forward_transitions_ = forward_transitions; + backward_transitions_ = backward_transitions; + transitions_ = transitions; +} + +void DenominatorGraph::SetInitialProbs(const fst::StdVectorFst &fst) { + // we set only the start-state to have probability mass, and then 100 + // iterations of HMM propagation, over which we average the probabilities. + // initial probs won't end up making a huge difference as we won't be using + // derivatives from the first few frames, so this isn't 100% critical. + int32 num_iters = 100; + int32 num_states = fst.NumStates(); + + // we normalize each state so that it sums to one (including + // final-probs)... this is needed because the 'chain' code doesn't + // have transition probabilities. + Vector normalizing_factor(num_states); + for (int32 s = 0; s < num_states; s++) { + double tot_prob = exp(-fst.Final(s).Value()); + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + tot_prob += exp(-aiter.Value().weight.Value()); + } + KALDI_ASSERT(tot_prob > 0.0 && tot_prob < 100.0); + normalizing_factor(s) = 1.0 / tot_prob; + } + + Vector cur_prob(num_states), next_prob(num_states), + avg_prob(num_states); + cur_prob(fst.Start()) = 1.0; + for (int32 iter = 0; iter < num_iters; iter++) { + avg_prob.AddVec(1.0 / num_iters, cur_prob); + for (int32 s = 0; s < num_states; s++) { + double prob = cur_prob(s) * normalizing_factor(s); + + for (fst::ArcIterator aiter(fst, s); !aiter.Done(); + aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + next_prob(arc.nextstate) += prob * exp(-arc.weight.Value()); + } + } + cur_prob.Swap(&next_prob); + next_prob.SetZero(); + // Renormalize, beause the HMM won't sum to one even after the + // previous normalization (due to final-probs). 
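    // Aside: a minimal dense-matrix sketch of the same averaging idea
    // (illustrative only; 'EstimateInitialProbs' is a hypothetical helper,
    // not part of these sources), assuming 'trans' is row-stochastic:
    //
    //   std::vector<double> EstimateInitialProbs(
    //       const std::vector<std::vector<double> > &trans,  // trans[i][j] = p(i -> j)
    //       int32 start_state, int32 num_iters) {
    //     int32 n = trans.size();
    //     std::vector<double> cur(n, 0.0), next(n, 0.0), avg(n, 0.0);
    //     cur[start_state] = 1.0;
    //     for (int32 iter = 0; iter < num_iters; iter++) {
    //       for (int32 i = 0; i < n; i++) avg[i] += cur[i] / num_iters;
    //       for (int32 i = 0; i < n; i++)
    //         for (int32 j = 0; j < n; j++) next[j] += cur[i] * trans[i][j];
    //       double sum = 0.0;
    //       for (int32 j = 0; j < n; j++) sum += next[j];
    //       for (int32 j = 0; j < n; j++) { cur[j] = next[j] / sum; next[j] = 0.0; }
    //     }
    //     return avg;  // approximate average occupation probabilities.
    //   }
    //
    // Returning to the renormalization noted above: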
+ cur_prob.Scale(1.0 / cur_prob.Sum()); + } + + Vector avg_prob_float(avg_prob); + initial_probs_ = avg_prob_float; +} + +void DenominatorGraph::GetNormalizationFst(const fst::StdVectorFst &ifst, + fst::StdVectorFst *ofst) { + KALDI_ASSERT(ifst.NumStates() == initial_probs_.Dim()); + if (&ifst != ofst) + *ofst = ifst; + int32 new_initial_state = ofst->AddState(); + Vector initial_probs(initial_probs_); + + for (int32 s = 0; s < initial_probs_.Dim(); s++) { + BaseFloat initial_prob = initial_probs(s); + KALDI_ASSERT(initial_prob > 0.0); + fst::StdArc arc(0, 0, fst::TropicalWeight(-log(initial_prob)), s); + ofst->AddArc(new_initial_state, arc); + ofst->SetFinal(s, fst::TropicalWeight::One()); + } + ofst->SetStart(new_initial_state); + fst::RmEpsilon(ofst); + fst::ArcSort(ofst, fst::ILabelCompare()); +} + + +void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, + fst::StdVectorFst *fst) { + int32 num_states = fst->NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::MutableArcIterator aiter(fst, s); + !aiter.Done(); aiter.Next()) { + fst::StdArc arc = aiter.Value(); + KALDI_ASSERT(arc.ilabel == arc.olabel); + if (arc.ilabel > 0) { + arc.ilabel = trans_model.TransitionIdToPdf(arc.ilabel) + 1; + arc.olabel = arc.ilabel; + aiter.SetValue(arc); + } + } + } +} + +void MinimizeAcceptorNoPush(fst::StdVectorFst *fst) { + BaseFloat delta = fst::kDelta * 10.0; // use fairly loose delta for + // aggressive minimimization. + fst::ArcMap(fst, fst::QuantizeMapper(delta)); + fst::EncodeMapper encoder(fst::kEncodeLabels | fst::kEncodeWeights, + fst::ENCODE); + fst::Encode(fst, &encoder); + fst::AcceptorMinimize(fst); + fst::Decode(fst, encoder); +} + +// This static function, used in CreateDenominatorFst, sorts an +// fst's states in decreasing order of number of transitions (into + out of) +// the state. 
The aim is to have states that have a lot of transitions +// either into them or out of them, be numbered earlier, so hopefully +// they will be scheduled first and won't delay the computation +static void SortOnTransitionCount(fst::StdVectorFst *fst) { + // negative_num_transitions[i] will contain (before sorting), the pair + // ( -(num-transitions-into(i) + num-transition-out-of(i)), i) + int32 num_states = fst->NumStates(); + std::vector > negative_num_transitions(num_states); + for (int32 i = 0; i < num_states; i++) { + negative_num_transitions[i].first = 0; + negative_num_transitions[i].second = i; + } + for (int32 i = 0; i < num_states; i++) { + for (fst::ArcIterator aiter(*fst, i); !aiter.Done(); + aiter.Next()) { + negative_num_transitions[i].first--; + negative_num_transitions[aiter.Value().nextstate].first--; + } + } + std::sort(negative_num_transitions.begin(), negative_num_transitions.end()); + std::vector order(num_states); + for (int32 i = 0; i < num_states; i++) + order[negative_num_transitions[i].second] = i; + fst::StateSort(fst, order); +} + +void DenGraphMinimizeWrapper(fst::StdVectorFst *fst) { + for (int32 i = 1; i <= 3; i++) { + fst::PushSpecial(fst, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after regular " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; + fst::StdVectorFst fst_reversed; + fst::Reverse(*fst, &fst_reversed); + fst::PushSpecial(&fst_reversed, fst::kDelta * 0.01); + MinimizeAcceptorNoPush(&fst_reversed); + fst::Reverse(fst_reversed, fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after reversed " + << "minimization is " << fst->NumStates() << " and " + << NumArcs(*fst) << " (pass " << i << ")"; + } + fst::RmEpsilon(fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after " + << "removing any epsilons introduced by reversal is " + << fst->NumStates() << " and " + << NumArcs(*fst); + fst::PushSpecial(fst, fst::kDelta * 0.01); +} + + +static void PrintDenGraphStats(const fst::StdVectorFst &den_graph) { + int32 num_states = den_graph.NumStates(); + int32 degree_cutoff = 3; // track states with <= transitions in/out. + int32 num_states_low_degree_in = 0, + num_states_low_degree_out = 0, + tot_arcs = 0; + std::vector num_in_arcs(num_states, 0); + for (int32 s = 0; s < num_states; s++) { + if (den_graph.NumArcs(s) <= degree_cutoff) { + num_states_low_degree_out++; + } + tot_arcs += den_graph.NumArcs(s); + for (fst::ArcIterator aiter(den_graph, s); + !aiter.Done(); aiter.Next()) { + int32 dest_state = aiter.Value().nextstate; + num_in_arcs[dest_state]++; + } + } + for (int32 s = 0; s < num_states; s++) { + if (num_in_arcs[s] <= degree_cutoff) { + num_states_low_degree_in++; + } + } + KALDI_LOG << "Number of states is " << num_states << " and arcs " + << tot_arcs << "; number of states with in-degree <= " + << degree_cutoff << " is " << num_states_low_degree_in + << " and with out-degree <= " << degree_cutoff + << " is " << num_states_low_degree_out; +} + + +// Check that every pdf is seen, warn if some are not. 
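
// Aside (illustrative only; never called from the real code, and the argument
// names are hypothetical): a minimal sketch of how the helpers in this file
// are typically wired together when preparing a 'chain' denominator model.
static void ExampleBuildDenominatorGraph(const ContextDependency &ctx_dep,
                                         const TransitionModel &trans_model,
                                         const fst::StdVectorFst &phone_lm) {
  fst::StdVectorFst den_fst;
  CreateDenominatorFst(ctx_dep, trans_model, phone_lm, &den_fst);
  DenominatorGraph den_graph(den_fst, trans_model.NumPdfs());
  fst::StdVectorFst normalization_fst;
  den_graph.GetNormalizationFst(den_fst, &normalization_fst);
  KALDI_LOG << "Example denominator graph has " << den_graph.NumStates()
            << " states; its normalization FST has "
            << normalization_fst.NumStates() << " states.";
}

// The function below performs the pdf-coverage check described in the
// one-line comment just above this aside.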
+static void CheckDenominatorFst(int32 num_pdfs, + const fst::StdVectorFst &den_fst) { + std::vector pdf_seen(num_pdfs); + int32 num_states = den_fst.NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::ArcIterator aiter(den_fst, s); + !aiter.Done(); aiter.Next()) { + int32 pdf_id = aiter.Value().ilabel - 1; + KALDI_ASSERT(pdf_id >= 0 && pdf_id < num_pdfs); + pdf_seen[pdf_id] = true; + } + } + for (int32 pdf = 0; pdf < num_pdfs; pdf++) { + if (!pdf_seen[pdf]) { + KALDI_WARN << "Pdf-id " << pdf << " is not seen in denominator graph."; + } + } +} + +void CreateDenominatorFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const fst::StdVectorFst &phone_lm_in, + fst::StdVectorFst *den_fst) { + using fst::StdVectorFst; + using fst::StdArc; + KALDI_ASSERT(phone_lm_in.NumStates() != 0); + fst::StdVectorFst phone_lm(phone_lm_in); + + KALDI_LOG << "Number of states and arcs in phone-LM FST is " + << phone_lm.NumStates() << " and " << NumArcs(phone_lm); + + int32 subsequential_symbol = trans_model.GetPhones().back() + 1; + if (ctx_dep.CentralPosition() != ctx_dep.ContextWidth() - 1) { + // note: this function only adds the subseq symbol to the input of what was + // previously an acceptor, so we project, i.e. copy the ilabels to the + // olabels + AddSubsequentialLoop(subsequential_symbol, &phone_lm); + fst::Project(&phone_lm, fst::PROJECT_INPUT); + } + std::vector disambig_syms; // empty list of diambiguation symbols. + fst::ContextFst cfst(subsequential_symbol, trans_model.GetPhones(), + disambig_syms, ctx_dep.ContextWidth(), + ctx_dep.CentralPosition()); + StdVectorFst context_dep_lm; + fst::ComposeContextFst(cfst, phone_lm, &context_dep_lm); + // at this point, context_dep_lm will have indexes into 'ilabels' as its + // input symbol (representing context-dependent phones), and phones on its + // output. We don't need the phones, so we'll project. + fst::Project(&context_dep_lm, fst::PROJECT_INPUT); + + KALDI_LOG << "Number of states and arcs in context-dependent LM FST is " + << context_dep_lm.NumStates() << " and " << NumArcs(context_dep_lm); + + std::vector disambig_syms_h; // disambiguation symbols on input side + // of H -- will be empty. + HTransducerConfig h_config; + // the default is 1, but just document that we want this to stay as one. + // we'll use the same value in test time. Consistency is the key here. + h_config.transition_scale = 1.0; + h_config.push_weights = true; + + StdVectorFst *h_fst = GetHTransducer(cfst.ILabelInfo(), + ctx_dep, + trans_model, + h_config, + &disambig_syms_h); + KALDI_ASSERT(disambig_syms_h.empty()); + StdVectorFst transition_id_fst; + TableCompose(*h_fst, context_dep_lm, &transition_id_fst); + delete h_fst; + + BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same + // value in test time. + bool reorder = false; + // add self-loops to the FST with transition-ids as its labels. + AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, + &transition_id_fst); + // at this point transition_id_fst will have transition-ids as its ilabels and + // context-dependent phones (indexes into ILabelInfo()) as its olabels. + // Discard the context-dependent phones by projecting on the input, keeping + // only the transition-ids. 
+ fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + + MapFstToPdfIdsPlusOne(trans_model, &transition_id_fst); + KALDI_LOG << "Number of states and arcs in transition-id FST is " + << transition_id_fst.NumStates() << " and " + << NumArcs(transition_id_fst); + + // RemoveEpsLocal doesn't remove all epsilons, but it keeps the graph small. + fst::RemoveEpsLocal(&transition_id_fst); + // If there are remaining epsilons, remove them. + fst::RmEpsilon(&transition_id_fst); + KALDI_LOG << "Number of states and arcs in transition-id FST after " + << "removing epsilons is " + << transition_id_fst.NumStates() << " and " + << NumArcs(transition_id_fst); + + DenGraphMinimizeWrapper(&transition_id_fst); + + SortOnTransitionCount(&transition_id_fst); + + *den_fst = transition_id_fst; + CheckDenominatorFst(trans_model.NumPdfs(), *den_fst); + PrintDenGraphStats(*den_fst); +} + + +int32 DenominatorGraph::NumStates() const { + return forward_transitions_.Dim(); +} +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-den-graph.h b/src/chain/chain-den-graph.h new file mode 100644 index 00000000000..b2510651f39 --- /dev/null +++ b/src/chain/chain-den-graph.h @@ -0,0 +1,168 @@ +// chain/chain-den-graph.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ +#define KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "chain/chain-datastruct.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-array.h" + +namespace kaldi { +namespace chain { + + +/** This class is responsible for storing the FST that we use as the + 'anti-model' or 'denominator-model', that models all possible phone + sequences (or most possible phone sequences, depending how we built it).. + It stores the FST in a format where we can access both the transitions out + of each state, and the transitions into each state. + + This class supports both GPU and non-GPU operation, but is optimized for + GPU. + */ +class DenominatorGraph { + public: + + // the number of states in the HMM. + int32 NumStates() const; + + // the number of PDFs (the labels on the transitions are numbered from 0 to + // NumPdfs() - 1). + int32 NumPdfs() const { return num_pdfs_; } + + DenominatorGraph(); + + // Initialize from epsilon-free acceptor FST with pdf-ids plus one as the + // labels. 'num_pdfs' is only needeed for checking. 
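  // (The "plus one" is because label 0 is reserved for epsilon in OpenFst,
  // so pdf-id p appears on the arcs of this FST as label p + 1.)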
+ DenominatorGraph(const fst::StdVectorFst &fst, + int32 num_pdfs); + + // returns the pointer to the forward-transitions array, indexed by hmm-state, + // which will be on the GPU if we're using a GPU. + const Int32Pair *ForwardTransitions() const; + + // returns the pointer to the backward-transitions array, indexed by + // hmm-state, which will be on the GPU if we're using a GPU. + const Int32Pair *BackwardTransitions() const; + + // returns the array to the actual transitions (this is indexed by the ranges + // returned from the ForwardTransitions and BackwardTransitions arrays). The + // memory will be GPU memory if we are using a GPU. + const DenominatorGraphTransition *Transitions() const; + + // returns the initial-probs of the HMM-states... note, these initial-probs + // don't mean initial at the start of the file, because we usually train on + // pieces of a file. They are approximate initial-probs obtained by running + // the HMM for a fixed number of time-steps (e.g. 100) and averaging the + // posteriors over those time-steps. The exact values won't be very critical. + // Note: we renormalize each HMM-state to sum to one before doing this. + const CuVector &InitialProbs() const; + + // This function outputs a modifified version of the FST that was used to + // build this object, that has an initial-state with epsilon transitions to + // each state, with weight determined by initial_probs_; and has each original + // state being final with probability one (note: we remove epsilons). This is + // used in computing the 'penalty_logprob' of the Supervision objects, to + // ensure that the objective function is never positive, which makes it more + // easily interpretable. 'ifst' must be the same FST that was provided to the + // constructor of this object. [note: ifst and ofst may be the same object.] + // This function ensures that 'ofst' is ilabel sorted (which will be useful in + // composition). + void GetNormalizationFst(const fst::StdVectorFst &ifst, + fst::StdVectorFst *ofst); + + // This function is only used in testing code. + void ScaleInitialProbs(BaseFloat s) { initial_probs_.Scale(s); } + + // Use default copy constructor and assignment operator. + private: + // functions called from the constructor + void SetTransitions(const fst::StdVectorFst &fst, int32 num_pfds); + + // work out the initial-probs. Note, there are no final-probs; we treat all + // states as final with probability one [we have a justification for this.. + // assuming it's roughly a well-normalized HMM, this makes sense; note that we + // train on chunks, so the beginning and end of a chunk appear at arbitrary + // points in the sequence. At both beginning and end of the chunk, we limit + // ourselves to only those pdf-ids that were allowed in the numerator + // sequence. + void SetInitialProbs(const fst::StdVectorFst &fst); + + // forward_transitions_ is an array, indexed by hmm-state index, + // of start and end indexes into the transition_ array, which + // give us the set of transitions out of this state. + CuArray forward_transitions_; + // backward_transitions_ is an array, indexed by hmm-state index, + // of start and end indexes into the transition_ array, which + // give us the set of transitions out of this state. + CuArray backward_transitions_; + // This stores the actual transitions. 
+ CuArray transitions_; + + // The initial-probability of all states, used on the first frame of a + // sequence [although we also apply the constraint that on the first frame, + // only pdf-ids that were active on the 1st frame of the numerator, are + // active. Because in general sequences won't start at the start of files, we + // make this a generic probability distribution close to the limiting + // distribution of the HMM. This isn't too critical. + CuVector initial_probs_; + + int32 num_pdfs_; +}; + + +// Function that does acceptor minimization without weight pushing... +// this is useful when constructing the denominator graph. +void MinimizeAcceptorNoPush(fst::StdVectorFst *fst); + +// Utility function used while building the graph. Converts +// transition-ids to pdf-ids plus one. Assumes 'fst' +// is an acceptor, but does not check this (only looks at its +// ilabels). +void MapFstToPdfIdsPlusOne(const TransitionModel &trans_model, + fst::StdVectorFst *fst); + +// Starting from an acceptor on phones that represents some kind of compiled +// language model (with no disambiguation symbols), this funtion creates the +// denominator-graph. Note: there is similar code in chain-supervision.cc, when +// creating the supervision graph. +void CreateDenominatorFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const fst::StdVectorFst &phone_lm, + fst::StdVectorFst *den_graph); + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DEN_GRAPH_H_ diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc new file mode 100644 index 00000000000..b0bdc43ae97 --- /dev/null +++ b/src/chain/chain-denominator.cc @@ -0,0 +1,429 @@ +// chain/chain-denominator.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
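
// A brief usage sketch, for orientation (illustrative only; the names 'opts',
// 'den_graph', 'num_sequences', 'nnet_output' and 'deriv_weight' are
// hypothetical stand-ins for what the training code would supply):
//
//   DenominatorComputation denominator(opts, den_graph, num_sequences,
//                                      nnet_output);
//   BaseFloat den_logprob_term = denominator.Forward();
//   CuMatrix<BaseFloat> nnet_output_deriv(nnet_output.NumRows(),
//                                         nnet_output.NumCols());
//   bool ok = denominator.Backward(deriv_weight, &nnet_output_deriv);
//
// where 'deriv_weight' would fold in the supervision weighting and the sign
// with which this term enters the overall objective.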
+ + +#include "chain/chain-denominator.h" +#include "chain/chain-kernels-ansi.h" + +namespace kaldi { +namespace chain { + +DenominatorComputation::DenominatorComputation( + const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output): + opts_(opts), + den_graph_(den_graph), + num_sequences_(num_sequences), + frames_per_sequence_(nnet_output.NumRows() / num_sequences_), + exp_nnet_output_transposed_(nnet_output, kTrans), + nnet_output_deriv_transposed_( + exp_nnet_output_transposed_.NumRows(), + std::min(exp_nnet_output_transposed_.NumCols(), + static_cast(kMaxDerivTimeSteps) * + num_sequences_)), + alpha_(frames_per_sequence_ + 1, + den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + beta_(2, den_graph_.NumStates() * num_sequences_ + num_sequences_, + kUndefined), + tot_prob_(num_sequences_, kUndefined), + tot_log_prob_(num_sequences_, kUndefined), + log_correction_term_(num_sequences_, kUndefined), + ok_(true) { + KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && + opts_.leaky_hmm_coefficient < 1.0); + // make sure the alpha sums and beta sums are zeroed. + alpha_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + beta_.ColRange(den_graph_.NumStates() * num_sequences_, + num_sequences_).SetZero(); + + KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); + exp_nnet_output_transposed_.ApplyExp(); +} + + +void DenominatorComputation::AlphaFirstFrame() { + // dim == num_hmm_states_ * num_sequences_. + BaseFloat *first_frame_alpha = alpha_.RowData(0); + // create a 'fake matrix' - view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(first_frame_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // TODO (possible): It would be more efficient here if we implemented a + // CopyColsFromVec function in class CuMatrix. + alpha_mat.SetZero(); + alpha_mat.AddVecToCols(1.0, den_graph_.InitialProbs(), 0.0); +} + + +// the alpha computation for some 0 < t <= num_time_steps_. +void DenominatorComputation::AlphaGeneralFrame(int32 t) { + KALDI_ASSERT(t > 0 && t <= frames_per_sequence_); + BaseFloat *this_alpha = alpha_.RowData(t); + const BaseFloat *prev_alpha_dash = alpha_.RowData(t - 1); + const Int32Pair *backward_transitions = den_graph_.BackwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(), + num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + + // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + (t-1) * num_sequences_, num_sequences_); + const BaseFloat *prob_data = probs.Data(); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_hmm_forward(dimGrid, dimBlock, + backward_transitions, transitions, + num_sequences, den_graph_.NumStates(), + prob_data, probs.Stride(), prev_alpha_dash, + this_alpha); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. 
We can compute the alphas for the remaining HMM states by + // moving some of the array pointers and making the call again. + backward_transitions += dimGrid.y; + this_alpha += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + double this_tot_alpha = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + prev_hmm_state = trans_iter->hmm_state; + BaseFloat prob = prob_data[pdf_id * prob_stride + s], + this_prev_alpha = prev_alpha_dash[prev_hmm_state * num_sequences + s]; + this_tot_alpha += this_prev_alpha * transition_prob * prob; + } + // Let arbitrary_scale be the inverse of the alpha-sum value that we + // store in the same place we'd store the alpha for the state numbered + // 'num_hmm_states'. We multiply this into all the + // transition-probabilities from the previous frame to this frame, in + // both the forward and backward passes, in order to keep the alphas in + // a good numeric range. This won't affect the posteriors, but when + // computing the total likelihood we'll need to compensate for it later + // on. + BaseFloat arbitrary_scale = + 1.0 / prev_alpha_dash[num_hmm_states * num_sequences + s]; + KALDI_ASSERT(this_tot_alpha - this_tot_alpha == 0); + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; + } + } + } +} + +void DenominatorComputation::AlphaDash(int32 t) { + BaseFloat *this_alpha = alpha_.RowData(t); + + // create a 'fake matrix' for the regular alphas- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix alpha_mat(this_alpha, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + // the alpha-dash is the sum of alpha over all states. + CuSubVector alpha_sum_vec(this_alpha + + den_graph_.NumStates() * num_sequences_, + num_sequences_); + alpha_sum_vec.AddRowSumMat(1.0, alpha_mat, 0.0); + + alpha_mat.AddVecVec(opts_.leaky_hmm_coefficient, + den_graph_.InitialProbs(), + alpha_sum_vec); + // it's now alpha-dash. +} + +// compute beta from beta-dash. +void DenominatorComputation::Beta(int32 t) { + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + // create a 'fake matrix' for the regular beta-dash (which is + // the counterpart of alpha-dash)- view this row as a matrix. + // initializer takes [pointer, num-rows, num-cols, stride]. + CuSubMatrix beta_dash_mat(this_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + // making the t index implicit, the beta-dash-sum for each sequence is the sum + // over all states i of beta_i * opts_.leaky_hmm_coefficient * initial_prob_i. + CuSubVector beta_dash_sum_vec( + this_beta_dash + den_graph_.NumStates() * num_sequences_, + num_sequences_); + beta_dash_sum_vec.AddMatVec(opts_.leaky_hmm_coefficient, beta_dash_mat, + kTrans, den_graph_.InitialProbs(), 0.0); + // we are computing beta in place. After the following, beta-dash-mat + // will contain the actual beta (i.e. the counterpart of alpha), + // not the beta-dash. 
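  // In the notation of the extended comment in chain-denominator.h, this is
  // the step beta(t, i) = beta'(t, i) + tot-beta(t): each state's beta-dash
  // gets the per-sequence leaky-HMM sum added to it.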
+ beta_dash_mat.AddVecToRows(1.0, beta_dash_sum_vec); +} + +BaseFloat DenominatorComputation::Forward() { + AlphaFirstFrame(); + AlphaDash(0); + for (int32 t = 1; t <= frames_per_sequence_; t++) { + AlphaGeneralFrame(t); + AlphaDash(t); + } + return ComputeTotLogLike(); +} + +BaseFloat DenominatorComputation::ComputeTotLogLike() { + tot_prob_.Resize(num_sequences_); + // View the last alpha-dash as a matrix of size num-hmm-states by num-sequences. + CuSubMatrix last_alpha_dash( + alpha_.RowData(frames_per_sequence_), + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + + tot_prob_.AddRowSumMat(1.0, last_alpha_dash, 0.0); + // we should probably add an ApplyLog() function that takes a vector argument. + tot_log_prob_ = tot_prob_; + tot_log_prob_.ApplyLog(); + BaseFloat tot_log_prob = tot_log_prob_.Sum(); + + // We now have to add something for the arbitrary scaling factor. [note: the + // purpose of the arbitrary scaling factors was to keep things in a good + // floating-point range] + // The inverses of all the tot-alpha quantities, for t = 0 + // ... frames_per_sequence_ - 1, were included as the 'arbitrary factors' in + // the transition-probs, so we need to multiply them all together (not + // inversed) and add them as a correction term to the total log-likes. + // These tot-alpha quantities were stored in the same place that we would + // have stored the HMM-state numbered 'num_hmm_states'. + int32 num_hmm_states = den_graph_.NumStates(); + CuSubMatrix inv_arbitrary_scales( + alpha_, 0, frames_per_sequence_, + num_sequences_ * num_hmm_states, num_sequences_); + CuMatrix log_inv_arbitrary_scales( + inv_arbitrary_scales); + log_inv_arbitrary_scales.ApplyLog(); + BaseFloat log_inv_arbitrary_scales_product = + log_inv_arbitrary_scales.Sum(); + return tot_log_prob + log_inv_arbitrary_scales_product; +} + + + +bool DenominatorComputation::Backward( + BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv) { + BetaDashLastFrame(); + Beta(frames_per_sequence_); + for (int32 t = frames_per_sequence_ - 1; t >= 0; t--) { + BetaDashGeneralFrame(t); + if (GetVerboseLevel() >= 1 || t == 0) + BetaGeneralFrameDebug(t); + Beta(t); + if (t % kMaxDerivTimeSteps == 0) { + // commit the derivative stored in exp_nnet_output_transposed_ by adding + // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. + int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), + frames_per_sequence_ - t), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix transposed_deriv_part( + nnet_output_deriv_transposed_, + 0, num_pdfs, + 0, chunk_frames * num_sequences_); + CuSubMatrix output_deriv_part( + *nnet_output_deriv, + t * num_sequences_, chunk_frames * num_sequences_, + 0, num_pdfs); + output_deriv_part.AddMat(deriv_weight, transposed_deriv_part, kTrans); + if (t != 0) + transposed_deriv_part.SetZero(); + } + } + return ok_; +} + +void DenominatorComputation::BetaDashLastFrame() { + // sets up the beta-dash quantity on the last frame (frame == + // frames_per_sequence_). Note that the betas we use here contain a + // 1/(tot-prob) factor in order to simplify the backprop. + + int32 t = frames_per_sequence_; + BaseFloat *last_frame_beta_dash = beta_.RowData(t % 2); + + // create a 'fake matrix' - view this row as a matrix. 
+ CuSubMatrix beta_dash_mat(last_frame_beta_dash, + den_graph_.NumStates(), + num_sequences_, + num_sequences_); + CuVector inv_tot_prob(tot_prob_); + inv_tot_prob.InvertElements(); + // the beta values at the end of the file only vary with the sequence-index, + // not with the HMM-index. We treat all states as having a final-prob of one. + beta_dash_mat.CopyRowsFromVec(inv_tot_prob); +} + +void DenominatorComputation::BetaDashGeneralFrame(int32 t) { + KALDI_ASSERT(t >= 0 && t < frames_per_sequence_); + int32 num_pdfs = exp_nnet_output_transposed_.NumRows(); + // t_wrapped gives us the time-index we use when indexing + // nnet_output_deriv_transposed_; to save memory we limit the size of the + // matrix, storing only chunks of frames at a time, and we add it to the + // non-transposed output whenever we finish a chunk. + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); + const BaseFloat *this_alpha_dash = alpha_.RowData(t), + *next_beta = beta_.RowData((t + 1) % 2); + BaseFloat *this_beta_dash = beta_.RowData(t % 2); + const Int32Pair *forward_transitions = den_graph_.ForwardTransitions(); + const DenominatorGraphTransition *transitions = den_graph_.Transitions(); + // 'probs' is the matrix of pseudo-likelihoods for frame t. + CuSubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, + t * num_sequences_, num_sequences_), + log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + + int32 num_hmm_states = den_graph_.NumStates(), + num_sequences = num_sequences_; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimBlock(std::min(CU1DBLOCK, num_sequences), 1, 1); + dim3 dimGrid(n_blocks(num_sequences, dimBlock.x), num_hmm_states, 1); + while (1) { + if (dimGrid.y > 65535) // the hardware doesn't allow more than this. + dimGrid.y = 65535; + cuda_chain_hmm_backward(dimGrid, dimBlock, forward_transitions, transitions, + num_sequences, num_hmm_states, + probs.Data(), probs.Stride(), + this_alpha_dash, next_beta, this_beta_dash, + log_prob_deriv.Data(), log_prob_deriv.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + if (dimGrid.y == num_hmm_states) { + break; // this is the normal case. + } else { + // We reach this code only in the unusual case where num_hmm_states > + // 65535. We can compute the betas (and log-prob derivatives) for the + // remaining HMM states by moving some of the array pointers and making + // the call again. 
+ forward_transitions += dimGrid.y; + this_alpha_dash += dimGrid.y * num_sequences; + this_beta_dash += dimGrid.y * num_sequences; + num_hmm_states -= dimGrid.y; + dimGrid.y = num_hmm_states; + } + } + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 prob_stride = probs.Stride(), + deriv_stride = log_prob_deriv.Stride(); + const BaseFloat *prob_data = probs.Data(); + BaseFloat *log_prob_deriv_data = log_prob_deriv.Data(); + for (int32 h = 0; h < num_hmm_states; h++) { + for (int32 s = 0; s < num_sequences; s++) { + BaseFloat this_alpha_dash_prob = this_alpha_dash[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha_dash[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0; + BaseFloat occupation_factor = this_alpha_dash_prob / + inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + for (; trans_iter != trans_end; ++trans_iter) { + BaseFloat transition_prob = trans_iter->transition_prob; + int32 pdf_id = trans_iter->pdf_id, + next_hmm_state = trans_iter->hmm_state; + BaseFloat variable_factor = transition_prob * + next_beta[next_hmm_state * num_sequences + s] * + prob_data[pdf_id * prob_stride + s]; + tot_variable_factor += variable_factor; + BaseFloat occupation_prob = variable_factor * occupation_factor; + log_prob_deriv_data[pdf_id * deriv_stride + s] += occupation_prob; + } + this_beta_dash[h * num_sequences + s] = + tot_variable_factor / inv_arbitrary_scale; + } + } + } +} + +void DenominatorComputation::BetaGeneralFrameDebug(int32 t) { + BaseFloat num_hmm_states = den_graph_.NumStates(), + alpha_beta_size = num_hmm_states * num_sequences_; + CuSubVector this_alpha_dash(alpha_.RowData(t), alpha_beta_size), + this_beta_dash(beta_.RowData(t % 2), alpha_beta_size); + int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), + num_pdfs = exp_nnet_output_transposed_.NumRows(); + CuSubMatrix this_log_prob_deriv( + nnet_output_deriv_transposed_, 0, num_pdfs, + t_wrapped * num_sequences_, num_sequences_); + BaseFloat alpha_beta_product = VecVec(this_alpha_dash, + this_beta_dash), + this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); + if (!ApproxEqual(alpha_beta_product, num_sequences_)) { + KALDI_WARN << "On time " << t << ", alpha-beta product " + << alpha_beta_product << " != " << num_sequences_ + << " alpha-dash-sum = " << this_alpha_dash.Sum() + << ", beta-dash-sum = " << this_beta_dash.Sum(); + if (fabs(alpha_beta_product - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } + // use higher tolerance, since we are using randomized pruning for the + // log-prob derivatives. 
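  // (The expected value is num_sequences_ because the derivative of the
  // log-prob w.r.t. the log of the pseudo-likelihoods is the per-frame
  // posterior, which sums to one over pdf-ids for each sequence; the
  // randomized pruning only preserves this in expectation.)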
+ if (!ApproxEqual(this_log_prob_deriv_sum, + num_sequences_, 0.01)) { + KALDI_WARN << "On time " << t << ", log-prob-deriv sum " + << this_log_prob_deriv_sum << " != " << num_sequences_; + if (fabs(this_log_prob_deriv_sum - num_sequences_) > 2.0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + ok_ = false; + } + } +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h new file mode 100644 index 00000000000..2da47a03c51 --- /dev/null +++ b/src/chain/chain-denominator.h @@ -0,0 +1,316 @@ +// chain/chain-denominator.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_DENOMINATOR_H_ +#define KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-training.h" + +namespace kaldi { +namespace chain { + + +/* + This extended comment describes how we implement forward-backward without log + and without overflow, and also the leaky-HMM idea. + + We'll start by establishing the notation for conventional forward-backward, + then add the 'arbitrary-scale' concept that prevents overflow, and then + add the 'leaky-hmm' concept. + + All this is done in parallel over multiple sequences, but the computations + are independent over the separate sequences, so we won't introduce any notation + or index for the sequence; we'll just explain it for one sequences. + + Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for + hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and + pred(i) give a list of arcs entering state i, and we'll use notation like: + for (j, p, n) in foll(i): + for iterating over those arcs, where in this case j is the destination-state, + p is the transition-probability of the arc and n is the pdf-id index. + We can then look up the emission probability as x(t, n) for some frame + 0 <= t < T. + + ** Version 1 of the computation (naive version) ** + + * Forward computation (version 1) + + In the forward computation we're computing alpha(i, t) for 0 <= t <= T): + - For the first frame, set alpha(0, i) = init(i), where init(i) is the + initial-probabilitiy from state i. # in our framework these are obtained + # by running the HMM for a while and getting an averaged occupation + # probability, and using this as an initial-prob, since the boundaries of + # chunks don't really correspond to utterance boundaries in general.] + - For t = 1 ... T: + for i = 0 ... 
I-1: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += x(t-1, n) * alpha(t-1, j) * p. + + - total-prob = \sum_i alpha(T, i). # note, we take the final-probs of all states + # to be 1.0. + + * Backward computation (version 1) + + And now for the backward computation. Contrary to tradition, we include the + inverse of the total-prob as a factor in the betas. This is both more + convenient (it simplifies the way we obtain posteriors), and makes the + algorithm more generalizable as all the beta quantities can be interpreted as + the partial derivative of the overall logprob with respect to their + corresponding alpha. + + In forward backward notation, gamma is normally used for state-level + occupation probabilities, but what we care about here is pdf-id-level + occupation probabilities (i.e. the partial derivative of the overall logprob + w.r.t. the logs of the x(t, n) quantities), so we use gamma for that. + + - for the final frame: + for each i, beta(T, i) = 1 / total-prob. + - for t = T-1 ... 0: + for i = 0 ... I-1: + beta(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta(t, i) += x(t, n) * beta(t+1, j) * p. + gamma(t, n) += alpha(t, i) * x(t, n) * beta(t+1, j) * p. + + ** Version 2 of the computation (renormalized version) ** + + Version 1 of the algorithm is susceptible to numeric underflow and overflow, + due to the limited range of IEEE floating-point exponents. + Define tot-alpha(t) = \sum_i alpha(t, i). Then the renormalized version of + the computation is as above, except whenever the quantity x(t, n) appears, + we replace it with x(t, n) / tot-alpha(t). In the algorithm we refer to + 1.0 / tot-alpha(t) as 'arbitrary_scale', because mathematically we can use any + value here as long as we are consistent and the value only varies with t + and not with n; we'll always get the same posteriors (gamma). + + When the algorithm outputs log(total-prob) as the total log-probability + of the HMM, we have to instead return the expression: + log(total-prob) + \sum_{t=0}^{T-1} tot-alpha(t). + to correct for the scaling of the x values. + + The algorithm is still vulnerable to overflow in the beta computation because + it's possible that the dominant path could have a very tiny alpha. However, + once we introduce the leaky-HMM idea (below), this problem will disappear. + + ** Version 3 of the computation (leaky-HMM version) ** + + The leaky-HMM idea is intended to improve generalization by allowing paths + other than those explicitly allowed by the FST we compiled. Another way to + look at it is as a way of hedging our bets about where we split the utterance, + so it's as we're marginalizing over different splits of the utterance. You + could also think of it as a modification of the FST so that there is an + epsilon transition from each state to a newly added state, with probability + one, and then an epsilon transition from the newly added state to each state + with probability leaky-hmm-prob * init(i) [except we need a mechanism so that + no more than two epsilon transitions can be taken per frame- this would involve + creating two copies of the states] + + Recall that we mentioned that init(i) is the initial-probability of + HMM-state i, but these are obtained in such a way that they can be treated + as priors, or average occupation-probabilities. 
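   (A notation check on the correction term from Version 2: writing c_t for
   tot-alpha(t), the value the code actually returns is
   log(total-prob) + \sum_{t=0}^{T-1} \log c_t -- see ComputeTotLogLike() in
   chain-denominator.cc, which takes the log of the stored per-frame
   alpha-sums and then sums them over t.)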
+ + Anyway, the way we formulate leaky-hmm is as follows: + + * Forward computation (version 3) + + Let leaky-hmm-prob be a constant defined by the user, with 0.1 being a typical + value. It defines how much probability we give to the 'leaky' transitions. + + - For frame 0, set alpha(0, i) = init(i). + - For 0 <= t <= T, define tot-alpha(t) = \sum_i alpha(t, i). + - For 0 <= t <= T, define alpha'(t, i) = alpha(t, i) + tot-alpha(t) * leaky-hmm-prob * init(i). + + - For 1 <= t <= T, the computation of alpha(t, i) is as before except we use + the previous frame's alpha' instead of alpha. That is: + alpha(t, i) = 0 + for (j, p, n) in pred(i): # note: j is preceding-state. + alpha(t, i) += alpha'(t-1, j) * p * x(t-1, n) / tot-alpha(t-1) + + - total-prob = \sum_i alpha'(T, i) + + The corrected log-prob that we return from the algorithm will be + (total-prob + \sum_{t=0}^{T-1} tot-alpha(t)). + + * Backward computation (version 3) + + The backward computation is as follows. It is fairly straightforward to + derive if you think of it as an instance of backprop where beta, tot-beta and + beta' are the partial derivatives of the output log-prob w.r.t. the + corresponding alpha, tot-alpha and alpha' quantities. Note, tot-beta is not + really the sum of the betas as its name might suggest, it's just the + derivative w.r.t. tot-alpha. + + - beta'(T, i) = 1 / total-prob. + - for 0 <= t <= T, define tot-beta(t) = leaky-hmm-prob * \sum_i init(i) * beta'(t, i) + - for 0 <= t <= T, define beta(t, i) = beta'(t, i) + tot-beta(t). + - for 0 <= t < T, we compute beta'(t, i) and update gamma(t, n) as follows: + for 0 <= i < I: + beta'(t, i) = 0 + for (j, p, n) in foll(i): # note: j is following-state. + beta'(t, i) += beta(t+1, j) * p * x(t, n) / tot-alpha(t) + gamma(t, n) += alpha'(t, i) * beta(t+1, j) * p * x(t, n) / tot-alpha(t) + + Note: in the code, the tot-alpha and tot-beta quantities go in the same + memory location that the corresponding alpha and beta for state I would go. + + */ + + +// This does forward-backward in parallel on a number of sequences, using a +// single HMM. +class DenominatorComputation { + public: + /* + Constructor. 'nnet_output' is the raw nnet output (which we'll treat as + pseudo-log-likelihoods). + + @param [in] opts The options. + @param [in] graph The HMM that we use for the denominator (like a decoding graph, + with pdf-ids on the transitions). + @param [in] num_sequences The number of separate time sequences (all of the same length) + that we are working with. Must divide nnet_output.NumRows(). + @param [in] nnet_output The output of the neural network for this minibatch. + The rows must be ordered as (first frame of all sequences) + (second frame of all sequences), etc. + */ + DenominatorComputation(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + int32 num_sequences, + const CuMatrixBase &nnet_output); + + // Does the forward computation, and returns the total negated log-like summed + // over all sequences. You will have to scale this by any supervision + // weighting factor, manually. + BaseFloat Forward(); + + // this adds deriv_weight times (the derivative of the log-prob w.r.t. the + // nnet output), to 'nnet_output_deriv'. + // returns true if everything seemed OK, false if a failure was detected. + bool Backward(BaseFloat deriv_weight, + CuMatrixBase *nnet_output_deriv); + + private: + // Defining this constant as an enum is easier. 
it controls a memory/speed + // tradeoff, determining how many frames' worth of the transposed derivative + // we store at a time. It's not very critical; the only disadvantage from + // setting it small is that we have to invoke an AddMat kernel more times. + enum { kMaxDerivTimeSteps = 8 }; + + // sets up the alpha for frame t = 0. + void AlphaFirstFrame(); + // the alpha computation for some 0 < t <= num_time_steps_. + void AlphaGeneralFrame(int32 t); + // does the 'alpha-dash' computation for time t. this relates to + // 'leaky hmm'. + void AlphaDash(int32 t); + + // done after all the alphas, this function computes and returns the total + // log-likelihood summed over all the sequences, and sets tot_prob_ (if we're + // doing correction) log_correction_term_. Note, this won't be scaled by + // 'deriv_scale' (which of course we haven't seen by the time this is called, + // from the Forward() computation). + BaseFloat ComputeTotLogLike(); + + void BetaDashLastFrame(); + // beta computation for 0 <= beta < num_time_steps_. + void BetaDashGeneralFrame(int32 t); + // compute the beta quantity from the beta-dash quantity (relates to leaky hmm). + void Beta(int32 t); + + // some checking that we can do if debug mode is activated, or on frame zero. + // Sets ok_ to false if a bad problem is detected. + void BetaGeneralFrameDebug(int32 t); + + const ChainTrainingOptions &opts_; + const DenominatorGraph &den_graph_; + + // number of separate frame sequences + int32 num_sequences_; + // number of frames per sequence. nnet_output_.NumRows() equals + // num_sequences_ * frames_per_sequence. + int32 frames_per_sequence_; + + // The transpose of the exp() of the nnet output (the transpose is more + // convenient for memory locality, and the exp() avoids us having to + // exponentiate in the forward-backward). + // + // The row-index is the pdf-id; and the column index equals (frame_index * + // num_sequences + sequence_index). + CuMatrix exp_nnet_output_transposed_; + + // the derivs w.r.t. the nnet outputs (transposed) + CuMatrix nnet_output_deriv_transposed_; + + // the (temporarily) alpha and (more permanently) alpha-dash probabilities; + // dimension is (frames_per_sequence + 1) by (num-hmm-states * num-sequences + + // num_sequences). Note, they are not logs. The last 'num_sequences' + // columns, where the alpha for the state indexed 'num_hmm_states' would live, + // are for the alpha-sums, which relates to leaky HMM. + CuMatrix alpha_; + + // the beta (also beta-dash) probabilities (rolling buffer); dimension is 2 * + // (num-hmm-states * num-sequences + num_sequences). [the last + // 'num_sequences' columns are for the beta-sums, which relates to leaky HMM.] + // Note: for efficiency and to simplify the equations, these are actually the + // beta / tot_prob_. + CuMatrix beta_; + + // the total probability for each sequence, excluding the product of + // correction terms. [the correction terms refer to the fact that we multiply + // on each frame by 1/alpha of hmm-state 0 of the previous frame.]. + // After the correction terms the total probability is fairly close to 1, + // which is why we can store it as non-log. + CuVector tot_prob_; + + // the log of tot_prob_. + CuVector tot_log_prob_; + + // the log of the total correction term for each sequence, which is the + // product of the alpha-sums [used in the leaky-hmm computation] over all the + // frames. The 'correction terms' are terms that we divide the alphas and + // betas by in order to keep them in a good dynamic range. 
The product of + // them must be included in the total likelihood. + CuVector log_correction_term_; + + bool ok_; +}; + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ + diff --git a/src/chain/chain-kernels-ansi.h b/src/chain/chain-kernels-ansi.h new file mode 100644 index 00000000000..388c78ab2ee --- /dev/null +++ b/src/chain/chain-kernels-ansi.h @@ -0,0 +1,56 @@ +// chain/chain-kernels-ansi.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ +#define KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ +#include "chain/chain-datastruct.h" + +#if HAVE_CUDA == 1 +extern "C" { + + void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl, + const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *this_alpha, + const BaseFloat *next_beta, + BaseFloat *this_beta, + BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride); + + void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl, + const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *prev_alpha, + BaseFloat *this_alpha); + +} // extern "C" + +#endif // HAVE_CUDA + + +#endif // KALDI_CHAIN_CHAIN_KERNELS_ANSI_H_ diff --git a/src/chain/chain-kernels.cu b/src/chain/chain-kernels.cu new file mode 100644 index 00000000000..8d555ee76cc --- /dev/null +++ b/src/chain/chain-kernels.cu @@ -0,0 +1,279 @@ +// chain/chain-kernels.cu + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
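
// Aside (illustrative only): a host-side, single-threaded reference for the
// expectation-preserving thresholded accumulation that atomic_add_thresholded()
// below implements on the GPU.  Values below the threshold are either dropped
// or rounded up to the threshold with probability (value / threshold), so the
// accumulated sum is unbiased.  'rand_uniform' is a hypothetical stand-in for
// the bit-twiddling pseudo-randomness used in the actual kernel.
static inline void add_thresholded_reference(float *address, float value,
                                             float rand_uniform /* in [0,1) */) {
  const float threshold = 1.0f / (1 << 14);  // cf. kThresholdingPowerOfTwo.
  if (value >= threshold) {
    *address += value;       // large values are added exactly.
  } else if (rand_uniform < value / threshold) {
    *address += threshold;   // small values: add the threshold itself,
  }                          // with probability (value / threshold).
}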
+ + +#include +#include "chain/chain-kernels-ansi.h" + + + +template +__device__ inline void atomic_add(Real* address, Real value) { + Real old = value; + Real ret = atomicExch(address, 0.0f); + Real new_old = ret + old; + while ((old = atomicExch(address, new_old)) != 0.0f) { + new_old = atomicExch(address, 0.0f); + new_old += old; + } +} + +template<> +__device__ inline void atomic_add(double* address, double val) { + unsigned long long int* address_as_ull = + reinterpret_cast(address); + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +} + +template +__device__ inline void atomic_add_thresholded(Real* address, Real value) { + // This function uses a randomized algorithm to only do atomic adds for values + // >=n a threshold, and if it's below the threshold, randomly add the + // threshold itself with probability (value / threshold). This preserves + // expectations. Note: we assume that value >= 0. + + // kThresholdingPowerOfTwo is defined in chain-datastruct.h; it defines + // the threshold for randomized posterior pruning. + const Real threshold = 1.0 / (1 << kThresholdingPowerOfTwo); + if (value >= threshold) { + atomic_add(address, value); + } else { + // The intention here is to do: + // with probability(value / threshold), do: + // atomic_add(address, threshold); + // We use the least significant bits of the value as a source of + // randomness. It would probably be more efficient to extract these + // random bits directly from the float, but I don't want to have to + // deal with endian-ness issues. + // + // below, x is a fixed-point representation of (value / threshold); it would + // be 16777216 == 2^24 if value == threshold and 0 if value == 0. We choose + // the power 24 because that's the number of binary digits in the mantissa + // in IEEE single precision floating point. + // Note: we parenthesize the expression like this so that the + // denominator can be precomputed as a constant expression. + int32_cuda x = value / (threshold / (1 << 24)); + // in the line below, the expression (x >> 12) is a representation of (value / + // threshold) between 0 and 4096, with 4096 representing (value / threshold == + // 1), while (x & 4095) is treated as a pseudorandom number between 0 and 4095. + if ((x >> 12) > (x & 4095)) + atomic_add(address, threshold); + } +} + +// one iteration of the forward computation in the 'tombstone' CTC HMM computation. +// The grid y determines which HMM-state we handle. [put this in the grid because +// HMM-states don't all take the same amount of time in the backwards direction, and it's +// better for scheduling to have them at the outer level.] +// The block x and grid x determine which sequence (0 ... num_sequences - 1) we handle; +// note that num_sequences == the number of elements in the minibatch, and we +// insist they all have the same number of time steps. +// note: 'probs' is indexed by sequence-index + (pdf-index * prob_stride). +__global__ +static void _cuda_chain_hmm_forward(const Int32Pair *backward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, + int32_cuda num_hmm_states, + const BaseFloat *probs, + int32_cuda prob_stride, + const BaseFloat *prev_alpha, + BaseFloat *this_alpha) { + // 'backward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transitions' array. 
This gives us the info for + // transitions *into* this state. 'probs' contains the exponentiated neural + // net outputs; it has dimension num-output-indexes by num_sequences and its + // stride is 'prob_stride'. 'prev_alpha' and 'this_alpha', which are + // extracted from a larger matrix, both have dimension num-history-states by + // num-sequences. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + double this_tot_alpha = 0.0; + const DenominatorGraphTransition + *trans_iter = transitions + backward_transitions[h].first, + *trans_end = transitions + backward_transitions[h].second; + // Note: regarding this loop unrolling, I tried the automatic unrolling using + // #pragma unroll 2 (after modifying the loop to have an integer index), but I + // did not see any performance improvement, it was slightly slower. So the + // compiler must be doing something different than what I'm doing here. + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + prev_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s], + pseudo_loglike1 = probs[pdf_id1 * prob_stride + s], + this_prev_alpha1 = prev_alpha[prev_hmm_state1 * num_sequences + s]; + + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0 + + this_prev_alpha1 * transition_prob1 * pseudo_loglike1; + } + if (trans_iter != trans_end) { + // mop up the odd transition. + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + prev_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat pseudo_loglike0 = probs[pdf_id0 * prob_stride + s], + this_prev_alpha0 = prev_alpha[prev_hmm_state0 * num_sequences + s]; + this_tot_alpha += this_prev_alpha0 * transition_prob0 * pseudo_loglike0; + } + + // Let arbitrary_scale be the inverse of the sum of all alpha values on-- the + // previous frame this sum of all the alpha values is stored in the place that + // we'd store the previous alpha for state-index equal to num_hmm_states + // (i.e. one past the end). We multiply this into all the + // transition-probabilities from the previous frame to this frame, in both the + // forward and backward passes, in order to keep the alphas in a good numeric + // range. This won't affect the posteriors, as it's just a constant factor + // for each frame, but when computing the total likelihood we'll need to + // compensate for it later on. 
+ BaseFloat arbitrary_scale = + 1.0 / prev_alpha[num_hmm_states * num_sequences + s]; + this_alpha[h * num_sequences + s] = this_tot_alpha * arbitrary_scale; +} + + +__global__ +static void _cuda_chain_hmm_backward(const Int32Pair *forward_transitions, + const DenominatorGraphTransition *transitions, + int32_cuda num_sequences, int32_cuda num_hmm_states, + const BaseFloat *probs, int32_cuda prob_stride, + const BaseFloat *this_alpha, const BaseFloat *next_beta, + BaseFloat *this_beta, BaseFloat *log_prob_deriv, + int32_cuda log_prob_deriv_stride) { + // 'forward_transitions', indexed by hmm-state, consists of [start, end] + // indexes into the 'transition_info' array. This is about the transitions + // *out of* this state. 'probs' contains the exponentiated neural net + // outputs; it has dimension num-output-indexes by num_sequences, and contains + // just the observation probabilities for this time index. Its stride is + // prob_stride. + // 'this_alpha', 'next_beta' and 'this_beta' all have dimension + // num-history-states by num-sequences. + // The beta probs are normalized in such a way (by multiplying by 1/(total-data-prob)) + // that to get occupation counts we don't need to multiply by 1/total-data-prob. + // deriv_scale is a factor (e.g. -1.0 or -0.99) that we multiply these derivs by + // while accumulating them. + + // s is the index of the sequence within the minibatch, + // from 0 .. num-egs-in-this-minibatch - 1. + // h is the hmm-state index. + int32_cuda s = threadIdx.x + blockIdx.x * blockDim.x, + h = blockIdx.y; + if (s >= num_sequences) + return; + + // See where arbitrary_scale is defined in the forward computation above, for + // more explanation of inv_arbitrary_scale. + BaseFloat this_alpha_prob = this_alpha[h * num_sequences + s], + inv_arbitrary_scale = + this_alpha[num_hmm_states * num_sequences + s]; + double tot_variable_factor = 0.0; + + BaseFloat occupation_factor = this_alpha_prob / inv_arbitrary_scale; + const DenominatorGraphTransition + *trans_iter = transitions + forward_transitions[h].first, + *trans_end = transitions + forward_transitions[h].second; + const int loop_unroll = 2; // don't change this without changing the code + // below. + for (; trans_iter + loop_unroll <= trans_end; trans_iter += loop_unroll) { + BaseFloat transition_prob0 = trans_iter[0].transition_prob; + int32_cuda pdf_id0 = trans_iter[0].pdf_id, + next_hmm_state0 = trans_iter[0].hmm_state; + BaseFloat transition_prob1 = trans_iter[1].transition_prob; + int32_cuda pdf_id1 = trans_iter[1].pdf_id, + next_hmm_state1 = trans_iter[1].hmm_state; + BaseFloat variable_factor0 = transition_prob0 * + next_beta[next_hmm_state0 * num_sequences + s] * + probs[pdf_id0 * prob_stride + s], + variable_factor1 = transition_prob1 * + next_beta[next_hmm_state1 * num_sequences + s] * + probs[pdf_id1 * prob_stride + s]; + tot_variable_factor += variable_factor0 + variable_factor1; + BaseFloat occupation_prob0 = variable_factor0 * occupation_factor; + atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s), + occupation_prob0); + BaseFloat occupation_prob1 = variable_factor1 * occupation_factor; + atomic_add_thresholded(log_prob_deriv + (pdf_id1 * log_prob_deriv_stride + s), + occupation_prob1); + } + if (trans_iter != trans_end) { + // mop up the odd transition. 
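+    // [Editor's note: since loop_unroll == 2, at most one transition can be
+    // left over after the unrolled loop; this branch handles that remainder.]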
+    BaseFloat transition_prob0 = trans_iter[0].transition_prob;
+    int32_cuda pdf_id0 = trans_iter[0].pdf_id,
+        next_hmm_state0 = trans_iter[0].hmm_state;
+    BaseFloat variable_factor0 = transition_prob0 *
+        next_beta[next_hmm_state0 * num_sequences + s] *
+        probs[pdf_id0 * prob_stride + s];
+    tot_variable_factor += variable_factor0;
+    BaseFloat occupation_prob0 = variable_factor0 * occupation_factor;
+    atomic_add_thresholded(log_prob_deriv + (pdf_id0 * log_prob_deriv_stride + s),
+                           occupation_prob0);
+  }
+  BaseFloat beta = tot_variable_factor / inv_arbitrary_scale;
+  this_beta[h * num_sequences + s] = beta;
+}
+
+
+void cuda_chain_hmm_forward(dim3 Gr, dim3 Bl,
+                            const Int32Pair *backward_transitions,
+                            const DenominatorGraphTransition *transitions,
+                            int32_cuda num_sequences,
+                            int32_cuda num_hmm_states,
+                            const BaseFloat *probs, int32_cuda prob_stride,
+                            const BaseFloat *prev_alpha,
+                            BaseFloat *this_alpha) {
+  _cuda_chain_hmm_forward<<<Gr, Bl>>>(backward_transitions, transitions,
+                                      num_sequences, num_hmm_states,
+                                      probs, prob_stride,
+                                      prev_alpha, this_alpha);
+}
+
+void cuda_chain_hmm_backward(dim3 Gr, dim3 Bl,
+                             const Int32Pair *forward_transitions,
+                             const DenominatorGraphTransition *transitions,
+                             int32_cuda num_sequences,
+                             int32_cuda num_hmm_states,
+                             const BaseFloat *probs, int32_cuda prob_stride,
+                             const BaseFloat *this_alpha, const BaseFloat *next_beta,
+                             BaseFloat *this_beta,
+                             BaseFloat *log_prob_deriv,
+                             int32_cuda log_prob_deriv_stride) {
+  _cuda_chain_hmm_backward<<<Gr, Bl>>>(forward_transitions, transitions,
+                                       num_sequences, num_hmm_states,
+                                       probs, prob_stride,
+                                       this_alpha, next_beta,
+                                       this_beta, log_prob_deriv,
+                                       log_prob_deriv_stride);
+}
+
diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc
new file mode 100644
index 00000000000..139d28bdd77
--- /dev/null
+++ b/src/chain/chain-numerator.cc
@@ -0,0 +1,213 @@
+// chain/chain-numerator.cc
+
+// Copyright      2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+ + +#include "chain/chain-numerator.h" +#include "cudamatrix/cu-vector.h" + +namespace kaldi { +namespace chain { + + +NumeratorComputation::NumeratorComputation( + const Supervision &supervision, + const CuMatrixBase &nnet_output): + supervision_(supervision), + nnet_output_(nnet_output) { + ComputeFstStateTimes(supervision_.fst, &fst_state_times_); + KALDI_ASSERT(supervision.num_sequences * supervision.frames_per_sequence == + nnet_output.NumRows() && + supervision.label_dim == nnet_output.NumCols()); +} + + +void NumeratorComputation::ComputeLookupIndexes() { + + int32 num_states = supervision_.fst.NumStates(); + int32 num_arcs_guess = num_states * 2; + fst_output_indexes_.reserve(num_arcs_guess); + + int32 frames_per_sequence = supervision_.frames_per_sequence, + num_sequences = supervision_.num_sequences, + cur_time = 0; + + // the following is a CPU version of nnet_output_indexes_. It is a list of + // pairs (row-index, column-index) which index nnet_output_. The row-index + // corresponds to the time-frame 't', and the column-index the pdf-id, but we + // have to be a little careful with the row-index because there is a + // reordering that happens if supervision_.num_sequences > 1. + // + + // output-index) and denominator_indexes_cpu is a list of pairs (c, + // history-state-index). + std::vector nnet_output_indexes_cpu; + + // index_map_this_frame is a map, only valid for t == cur_time, + // from the pdf-id to the index into nnet_output_indexes_cpu for the + // likelihood at (cur_time, pdf-id). + unordered_map index_map_this_frame; + + typedef unordered_map::iterator IterType; + + for (int32 state = 0; state < num_states; state++) { + int32 t = fst_state_times_[state]; + if (t != cur_time) { + KALDI_ASSERT(t == cur_time + 1); + index_map_this_frame.clear(); + cur_time = t; + } + for (fst::ArcIterator aiter(supervision_.fst, state); + !aiter.Done(); aiter.Next()) { + int32 pdf_id = aiter.Value().ilabel - 1; + KALDI_ASSERT(pdf_id >= 0 && pdf_id < supervision_.label_dim); + + int32 index = nnet_output_indexes_cpu.size(); + + // the next few lines are a more efficient way of doing the following: + // if (index_map_this_frame.count(pdf_id) == 0) { + // index = index_map_this_frame[pdf_id] = nnet_output_indexes_cpu.size(); + // nnet_output_indexes_cpu.push_back(pair(pdf_id, row-index)); + // } else { + // index = index_map_this_frame[pdf_id]; + // } + std::pair p = index_map_this_frame.insert( + std::pair(pdf_id, index)); + if (p.second) { // Was inserted -> map had no key 'output_index' + Int32Pair pair; // we can't use constructors as this was declared in C. + pair.first = ComputeRowIndex(t, frames_per_sequence, num_sequences); + pair.second = pdf_id; + nnet_output_indexes_cpu.push_back(pair); + } else { // was not inserted -> set 'index' to the existing index. 
+ index = p.first->second; + } + fst_output_indexes_.push_back(index); + } + } + nnet_output_indexes_ = nnet_output_indexes_cpu; + KALDI_ASSERT(!fst_output_indexes_.empty()); +} + +BaseFloat NumeratorComputation::Forward() { + ComputeLookupIndexes(); + nnet_logprobs_.Resize(nnet_output_indexes_.Dim(), kUndefined); + nnet_output_.Lookup(nnet_output_indexes_, nnet_logprobs_.Data()); + const fst::StdVectorFst &fst = supervision_.fst; + KALDI_ASSERT(fst.Start() == 0); + int32 num_states = fst.NumStates(); + log_alpha_.Resize(num_states, kUndefined); + log_alpha_.Set(-std::numeric_limits::infinity()); + tot_log_prob_ = -std::numeric_limits::infinity(); + + log_alpha_(0) = 0.0; // note, state zero is the start state, we checked above + + const BaseFloat *nnet_logprob_data = nnet_logprobs_.Data(); + std::vector::const_iterator fst_output_indexes_iter = + fst_output_indexes_.begin(); + + double *log_alpha_data = log_alpha_.Data(); + + for (int32 state = 0; state < num_states; state++) { + double this_log_alpha = log_alpha_data[state]; + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); + aiter.Next(), ++fst_output_indexes_iter) { + const fst::StdArc &arc = aiter.Value(); + int32 nextstate = arc.nextstate; + BaseFloat transition_logprob = -arc.weight.Value(); + int32 index = *fst_output_indexes_iter; + BaseFloat pseudo_loglike = nnet_logprob_data[index]; + double &next_log_alpha = log_alpha_data[nextstate]; + next_log_alpha = LogAdd(next_log_alpha, pseudo_loglike + + transition_logprob + this_log_alpha); + } + if (fst.Final(state) != fst::TropicalWeight::Zero()) { + BaseFloat final_logprob = -fst.Final(state).Value(); + tot_log_prob_ = LogAdd(tot_log_prob_, + this_log_alpha + final_logprob); + } + } + KALDI_ASSERT(fst_output_indexes_iter == + fst_output_indexes_.end()); + return tot_log_prob_ * supervision_.weight; +} + + +void NumeratorComputation::Backward( + CuMatrixBase *nnet_output_deriv) { + const fst::StdVectorFst &fst = supervision_.fst; + int32 num_states = fst.NumStates(); + log_beta_.Resize(num_states, kUndefined); + nnet_logprob_derivs_.Resize(nnet_logprobs_.Dim()); + + // we'll be counting backwards and moving the 'fst_output_indexes_iter' + // pointer back. + const int32 *fst_output_indexes_iter = &(fst_output_indexes_[0]) + + fst_output_indexes_.size(); + const BaseFloat *nnet_logprob_data = nnet_logprobs_.Data(); + double tot_log_prob = tot_log_prob_; + double *log_beta_data = log_beta_.Data(); + const double *log_alpha_data = log_alpha_.Data(); + BaseFloat *nnet_logprob_deriv_data = nnet_logprob_derivs_.Data(); + + for (int32 state = num_states - 1; state >= 0; state--) { + int32 this_num_arcs = fst.NumArcs(state); + // on the backward pass we access the fst_output_indexes_ vector in a zigzag + // pattern. 
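+    // [Editor's note: "zigzag" here means that the states are visited in
+    // reverse order while each state's arcs were appended to
+    // fst_output_indexes_ in forward order; so we first step the pointer back
+    // by this state's arc count, then read its entries left-to-right via the
+    // separate iterator below.]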
+ fst_output_indexes_iter -= this_num_arcs; + const int32 *this_fst_output_indexes_iter = fst_output_indexes_iter; + double this_log_beta = -fst.Final(state).Value(); + double this_log_alpha = log_alpha_data[state]; + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); + aiter.Next(), this_fst_output_indexes_iter++) { + const fst::StdArc &arc = aiter.Value(); + double next_log_beta = log_beta_data[arc.nextstate]; + BaseFloat transition_logprob = -arc.weight.Value(); + int32 index = *this_fst_output_indexes_iter; + BaseFloat pseudo_loglike = nnet_logprob_data[index]; + this_log_beta = LogAdd(this_log_beta, pseudo_loglike + + transition_logprob + next_log_beta); + BaseFloat occupation_logprob = this_log_alpha + pseudo_loglike + + transition_logprob + next_log_beta - tot_log_prob, + occupation_prob = exp(occupation_logprob); + nnet_logprob_deriv_data[index] += occupation_prob; + } + // check for -inf. + KALDI_PARANOID_ASSERT(this_log_beta - this_log_beta == 0); + log_beta_data[state] = this_log_beta; + } + KALDI_ASSERT(fst_output_indexes_iter == &(fst_output_indexes_[0])); + + int32 start_state = 0; // the fact that the start state is numbered 0 is + // implied by other properties of the FST + // (epsilon-free-ness and topological sorting, and + // connectedness). + double tot_log_prob_backward = log_beta_(start_state); + if (!ApproxEqual(tot_log_prob_backward, tot_log_prob_)) + KALDI_WARN << "Disagreement in forward/backward log-probs: " + << tot_log_prob_backward << " vs. " << tot_log_prob_; + + // copy this data to GPU. + CuVector nnet_logprob_deriv_cuda; + nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); + nnet_output_deriv->AddElements(supervision_.weight, nnet_output_indexes_, + nnet_logprob_deriv_cuda.Data()); +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h new file mode 100644 index 00000000000..15cb31e0571 --- /dev/null +++ b/src/chain/chain-numerator.h @@ -0,0 +1,146 @@ +// chain/chain-numerator.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_NUMERATOR_H_ +#define KALDI_CHAIN_CHAIN_NUMERATOR_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "chain/chain-supervision.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-array.h" + +namespace kaldi { +namespace chain { + + +// This class is responsible for the forward-backward of the 'supervision' +// (numerator) FST. +// +// note: the supervision.weight is ignored by this class, you have to apply +// it externally. +// Because the supervision FSTs are quite skinny, i.e. 
have very few paths for
+// each frame, it's feasible to do this computation on the CPU, and that's what
+// we do.  We transfer from/to the GPU only the things that we need.
+
+class NumeratorComputation {
+
+ public:
+
+  /// Initialize the object.  Note: we expect the 'nnet_output' to have the
+  /// same number of rows as supervision.num_frames * supervision.num_sequences,
+  /// and the same number of columns as the 'label-dim' of the supervision
+  /// object (which will be the NumPdfs() of the transition model); but the
+  /// ordering of the rows of 'nnet_output' is not the same as the ordering of
+  /// frames in paths in the 'supervision' object (which has all frames of the
+  /// 1st sequence first, then the 2nd sequence, and so on).  Instead, the
+  /// frames in 'nnet_output' are ordered as: first the first frame of each
+  /// sequence, then the second frame of each sequence, and so on.  This is more
+  /// convenient both because the nnet3 code internally orders them that way,
+  /// and because this makes it easier to order things in the way that class
+  /// SingleHmmForwardBackward needs (we can just transpose, instead of doing a
+  /// 3d tensor rearrangement).
+  NumeratorComputation(const Supervision &supervision,
+                       const CuMatrixBase<BaseFloat> &nnet_output);
+
+  // TODO: we could enable a Viterbi mode.
+
+  // Does the forward computation.  Returns the total log-prob multiplied
+  // by supervision_.weight.
+  BaseFloat Forward();
+
+  // Does the backward computation and (efficiently) adds the derivative of the
+  // nnet output w.r.t. the (log-prob times supervision_.weight times
+  // deriv_weight) to 'nnet_output_deriv'.
+  void Backward(CuMatrixBase<BaseFloat> *nnet_output_deriv);
+
+ private:
+
+  const Supervision &supervision_;
+
+  // state times of supervision_.fst.
+  std::vector<int32> fst_state_times_;
+
+
+  // the exp of the neural net output.
+  const CuMatrixBase<BaseFloat> &nnet_output_;
+
+
+  // 'fst_output_indexes_' contains an entry for each arc in the supervision FST, in
+  // the order you'd get them if you visit each arc of each state in order.
+  // the contents of fst_output_indexes_ are indexes into nnet_output_indexes_
+  // and nnet_logprobs_.
+  std::vector<int32> fst_output_indexes_;
+
+  // nnet_output_indexes_ is a list of (row, column) indexes that we need to look
+  // up in nnet_output_ for the forward-backward computation.  The order is
+  // arbitrary, but indexes into this vector appear in fst_output_indexes_;
+  // and it's important that each pair only appear once (in order for the
+  // derivatives to be summed properly).
+  CuArray<Int32Pair> nnet_output_indexes_;
+
+  // the log-probs obtained from lookup in the nnet output, on the CPU.  This
+  // vector has the same size as nnet_output_indexes_.  In the backward
+  // computation, the storage is re-used for derivatives.
+  Vector<BaseFloat> nnet_logprobs_;
+
+  // derivatives w.r.t. the nnet logprobs.  These can be interpreted as
+  // occupation probabilities.
+  Vector<BaseFloat> nnet_logprob_derivs_;
+
+  // The log-alpha value (forward probability) for each state in the lattice.
+  Vector<double> log_alpha_;
+
+  // The total pseudo-log-likelihood from the forward-backward.
+  double tot_log_prob_;
+
+  // The log-beta value (backward probability) for each state in the lattice.
+  Vector<double> log_beta_;
+
+  // This function creates fst_output_indexes_ and nnet_output_indexes_.
+  void ComputeLookupIndexes();
+
+  // convert time-index in the FST to a row-index in the nnet-output (to account
+  // for the fact that the sequences are interleaved in the nnet-output).
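+  // [Editor's example: with num_sequences = 2 and frames_per_sequence = 3, the
+  // FST's time-indexes cover sequence 0 first (t = 0,1,2) and then sequence 1
+  // (t = 3,4,5), while the nnet output is ordered frame-major:
+  // (seq 0, frame 0), (seq 1, frame 0), (seq 0, frame 1), ...  So t = 4,
+  // i.e. frame 1 of sequence 1, maps to row 4/3 + 2*(4%3) = 1 + 2 = 3.]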
+ inline int32 ComputeRowIndex(int32 t, int32 frames_per_sequence, + int32 num_sequences) { + return t / frames_per_sequence + + num_sequences * (t % frames_per_sequence); + } + +}; + + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_NUMERATOR_H_ + diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc new file mode 100644 index 00000000000..d4b891db06e --- /dev/null +++ b/src/chain/chain-supervision-test.cc @@ -0,0 +1,626 @@ +// chain/chain-supervision-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision.h" +#include "chain/chain-numerator.h" +#include "fstext/fstext-lib.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-vector.h" +#include "hmm/hmm-test-utils.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-denominator.h" +#include "hmm/hmm-utils.h" + + + +namespace kaldi { +namespace chain { + +// computes a phone language-model FST, which has only monophone context. +void ComputeExamplePhoneLanguageModel(const std::vector &phones, + fst::StdVectorFst *g_fst) { + + g_fst->DeleteStates(); + int32 state = g_fst->AddState(); + g_fst->SetStart(state); + + Vector probs(phones.size() + 1); + probs.SetRandn(); + probs.ApplyPow(2.0); + probs.Add(0.01); + probs.Scale(1.0 / probs.Sum()); + + for (size_t i = 0; i < phones.size(); i++) { + int32 phone = phones[i]; + fst::StdArc arc(phone, phone, + fst::TropicalWeight(-log(probs(i))), state); + g_fst->AddArc(state, arc); + } + g_fst->SetFinal(state, fst::TropicalWeight(-log(probs(phones.size())))); +} + + +void ComputeExampleDenFst(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + fst::StdVectorFst *den_graph) { + using fst::StdVectorFst; + using fst::StdArc; + StdVectorFst phone_lm; + ComputeExamplePhoneLanguageModel(trans_model.GetPhones(), &phone_lm); + + CreateDenominatorFst(ctx_dep, trans_model, phone_lm, den_graph); +} + + +void TestSupervisionIo(const Supervision &supervision) { + bool binary = (RandInt(0, 1) == 0); + std::ostringstream os; + supervision.Write(os, binary); + std::istringstream is(os.str()); + Supervision supervision2; + if (RandInt(0, 1) == 0) + supervision2 = supervision; // test reading already-existing object. 
+ supervision2.Read(is, binary); + std::ostringstream os2; + supervision2.Write(os2, binary); + KALDI_ASSERT(os.str() == os2.str()); + if (binary) { + KALDI_ASSERT(supervision == supervision2); + } + // also test swap and constructor + Supervision supervision3(supervision), supervision4; + supervision3.Swap(&supervision4); + KALDI_ASSERT(supervision == supervision4); +} + +void TestSupervisionNumerator(const Supervision &supervision) { + + CuMatrix nnet_output(supervision.num_sequences * + supervision.frames_per_sequence, + supervision.label_dim); + nnet_output.SetRandn(); + + NumeratorComputation num(supervision, nnet_output); + + // Test that derivs are accurate. + + BaseFloat forward_prob = num.Forward(); + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols()); + num.Backward(&nnet_output_deriv); + + int32 dim = 3; + Vector predicted_objf_changes(dim), + observed_objf_changes(dim); + BaseFloat delta = 1.0e-04; + for (int32 p = 0; p < dim; p++) { + CuMatrix new_nnet_output(nnet_output.NumRows(), + nnet_output.NumCols()); + new_nnet_output.SetRandn(); + new_nnet_output.Scale(delta); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, new_nnet_output, + kTrans); + new_nnet_output.AddMat(1.0, nnet_output); + NumeratorComputation num2(supervision, new_nnet_output); + observed_objf_changes(p) = num2.Forward() - forward_prob; + } + KALDI_LOG << "Predicted objf changes are: " + << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are: " + << observed_objf_changes; + + { + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.1)); + } + + + { + CuVector rand(nnet_output.NumRows()); + rand.SetRandn(); + CuMatrix nnet_output_mod(nnet_output); + nnet_output_mod.AddVecToCols(1.0, rand); + NumeratorComputation num_mod(supervision, nnet_output_mod); + BaseFloat forward_prob_mod = num_mod.Forward(); + BaseFloat predicted_change = rand.Sum(), + observed_change = forward_prob_mod - forward_prob; + KALDI_ASSERT(fabs(predicted_change - observed_change) < 0.1); + } + + +} + +void TestSupervisionAppend(const TransitionModel &trans_model, + const Supervision &supervision) { + int32 num_append = RandInt(1,5); + std::vector input(num_append); + for (int32 i = 0; i < num_append; i++) + input[i] = &supervision; + std::vector output; + bool compactify = (RandInt(0, 1) == 0); + AppendSupervision(input, compactify, &output); + if (compactify) { + KALDI_ASSERT(output.size() == 1 && + output[0].frames_per_sequence == + supervision.frames_per_sequence && + output[0].num_sequences == num_append); + } else { + KALDI_ASSERT(output.size() == input.size()); + } + int32 tot_sequences_in = 0, tot_sequences_out = 0, + tot_frames_in = 0, tot_frames_out = 0; + for (int32 i = 0; i < num_append; i++) { + tot_sequences_in += input[i]->num_sequences; + tot_frames_in += input[i]->num_sequences * + input[i]->frames_per_sequence; + } + for (int32 i = 0; i < output.size(); i++) { + tot_sequences_out += output[i].num_sequences; + tot_frames_out += output[i].num_sequences * + output[i].frames_per_sequence; + } + KALDI_ASSERT(tot_sequences_out == tot_sequences_in && + tot_frames_out == tot_frames_in); + + TestSupervisionIo(output[0]); + TestSupervisionNumerator(output[0]); + output[0].Check(trans_model); +} + +void 
TestSupervisionReattached(const TransitionModel &trans_model, + const Supervision &supervision, + const Supervision &reattached_supervision) { + using namespace fst; + KALDI_LOG << "testing reattached"; + KALDI_ASSERT(reattached_supervision.frames_per_sequence * + reattached_supervision.num_sequences == + supervision.frames_per_sequence * supervision.num_sequences && + reattached_supervision.weight == supervision.weight && + reattached_supervision.label_dim == supervision.label_dim); + UniformArcSelector selector; + RandGenOptions > randgen_opts(selector); + StdVectorFst fst_path; + RandGen(supervision.fst, &fst_path, randgen_opts); + StdVectorFst composed; + Compose(fst_path, reattached_supervision.fst, &composed); + Connect(&composed); + KALDI_ASSERT(composed.NumStates() != 0); + supervision.Check(trans_model); + reattached_supervision.Check(trans_model); +} + + +void TestSupervisionFrames(const Supervision &supervision) { + using namespace fst; + UniformArcSelector selector; + RandGenOptions > randgen_opts(selector); + VectorFst rand_path; + RandGen(supervision.fst, &rand_path, randgen_opts); + std::vector isymbols_out, osymbols_out; + fst::TropicalWeight weight_out; + bool ans = GetLinearSymbolSequence(rand_path, &isymbols_out, &osymbols_out, + &weight_out); + KALDI_ASSERT(ans); + KALDI_ASSERT(isymbols_out == osymbols_out); + KALDI_ASSERT(isymbols_out.size() == + static_cast(supervision.num_sequences * + supervision.frames_per_sequence)); + KALDI_ASSERT(weight_out == fst::TropicalWeight::One()); + + bool test = true; + // make sure epsilon free + KALDI_ASSERT(supervision.fst.Properties(fst::kNoEpsilons, test) != 0); + // make sure acceptor + KALDI_ASSERT(supervision.fst.Properties(fst::kAcceptor, test) != 0); +} + + +void ChainTrainingTest(const DenominatorGraph &den_graph, + const Supervision &supervision) { + int32 num_sequences = supervision.num_sequences, + frames_per_sequence = supervision.frames_per_sequence; + if (frames_per_sequence == 1) // this will break some code. + return; + + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + if (RandInt(0, 1) == 1) + opts.leaky_hmm_coefficient = 0.2; + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols(), + kUndefined); + + BaseFloat objf, l2_term, weight; + + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, &objf, &l2_term, &weight, + &nnet_output_deriv); + + { + // make sure each row of nnet_output_deriv sums to one (shift invariance of + // the nnet output). + CuVector nnet_output_deriv_row_sums(nnet_output_deriv.NumRows()); + nnet_output_deriv_row_sums.AddColSumMat(1.0, nnet_output_deriv, 0.0); + KALDI_ASSERT(nnet_output_deriv_row_sums.Norm(2.0) < 0.1); + } + + KALDI_LOG << "Chain objf per frame is " << (objf / weight) + << " over " << weight << " frames (weighted)"; + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. 
expected 0."; + KALDI_ASSERT(output_deriv_sum < 0.2); + } + + KALDI_ASSERT(objf <= 0.0); + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + BaseFloat objf_modified, l2_term_modified, weight_modified; + + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output_perturbed, + &objf_modified, &l2_term_modified, + &weight_modified, + NULL); + + observed_objf_changes(p) = objf_modified - objf; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative accuracy is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + + { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). Try to correct for this... + BaseFloat correction = (predicted_objf_changes.Sum() - observed_objf_changes.Sum()) / + predicted_objf_changes.Dim(); + observed_objf_changes.Add(correction); + KALDI_LOG << "Correcting observed objf changes for statistical effects, to " + << observed_objf_changes; + if (frames_per_sequence > 2 && + predicted_objf_changes.Norm(2.0) > 0.1 * epsilon) { + // if we only have the initial and final frames, due to the scaling-down + // of pdfs not in the numerator sequence the derivative might be zero, + // which would cause problems doing the comparison. + // note, epsilon = 1.0e-04. 
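+      // [Editor's note: the 'correction' above adds the mean of
+      // (predicted - observed) to every observed change, so the ApproxEqual
+      // check below compares only the zero-mean part of the discrepancy,
+      // which is what the gradient test can meaningfully predict.]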
+ KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } + } +} + +void TestSupervisionSplitting(const ContextDependency &ctx_dep, + const TransitionModel &trans_model, + const Supervision &supervision) { + fst::StdVectorFst den_fst, normalization_fst; + ComputeExampleDenFst(ctx_dep, trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model.NumPdfs()); + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + + SupervisionSplitter splitter(supervision); + int32 num_frames = supervision.num_sequences * supervision.frames_per_sequence, + frames_per_range = RandInt(3, 10); + + std::vector range_starts; + SplitIntoRanges(num_frames, frames_per_range, &range_starts); + int32 num_ranges = range_starts.size(); + std::vector split_supervision(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + splitter.GetFrameRange(range_starts[i], frames_per_range, + &split_supervision[i]); + bool ans = AddWeightToSupervisionFst(normalization_fst, + &split_supervision[i]); + KALDI_ASSERT(ans); + split_supervision[i].Check(trans_model); + } + if (num_ranges > 0) { + TestSupervisionIo(split_supervision[RandInt(0, num_ranges - 1)]); + TestSupervisionFrames(split_supervision[RandInt(0, num_ranges - 1)]); + + std::vector reattached_supervision; + std::vector to_append(num_ranges); + for (int32 i = 0; i < num_ranges; i++) + to_append[i] = &(split_supervision[i]); + bool compactify = true; + AppendSupervision(to_append, compactify, &reattached_supervision); + KALDI_ASSERT(reattached_supervision.size() == 1); + ChainTrainingTest(den_graph, reattached_supervision[0]); + if (num_frames % frames_per_range == 0) { + TestSupervisionReattached(trans_model, + supervision, + reattached_supervision[0]); + } + } +} + + +void ChainDenominatorTest(const DenominatorGraph &den_graph) { + + int32 num_sequences = RandInt(1, 5), + frames_per_sequence = RandInt(10, 20); + if (RandInt(0, 3) == 0) + frames_per_sequence *= 30; // test how it works on long sequences + CuMatrix nnet_output(num_sequences * frames_per_sequence, + den_graph.NumPdfs()); + + bool zero_output = (RandInt(0, 3) == 0); + if (!zero_output) + nnet_output.SetRandn(); + + ChainTrainingOptions opts; + + DenominatorComputation denominator_computation(opts, den_graph, + num_sequences, nnet_output); + + BaseFloat forward_prob = denominator_computation.Forward(), + per_frame = forward_prob / (num_sequences * frames_per_sequence); + KALDI_LOG << "Forward prob is " << forward_prob + << " = " << per_frame << " per frame."; + + CuMatrix nnet_output_deriv(nnet_output.NumRows(), + nnet_output.NumCols()); + + denominator_computation.Backward(1.0, &nnet_output_deriv); + + + { // a check + BaseFloat output_deriv_sum = nnet_output_deriv.Sum(); + KALDI_LOG << "Sum of nnet-output-deriv is " << output_deriv_sum + << " vs. 
expected " << (num_sequences * frames_per_sequence); + KALDI_ASSERT(output_deriv_sum - BaseFloat(num_sequences * frames_per_sequence) < + 10.0); + } + + int32 num_tries = 5; + BaseFloat epsilon = 1.0e-04; + Vector predicted_objf_changes(num_tries), + observed_objf_changes(num_tries); + for (int32 p = 0; p < num_tries; p++) { + CuMatrix nnet_delta_output(nnet_output.NumRows(), + nnet_output.NumCols()); + nnet_delta_output.SetRandn(); + nnet_delta_output.Scale(epsilon); + predicted_objf_changes(p) = TraceMatMat(nnet_output_deriv, + nnet_delta_output, kTrans); + CuMatrix nnet_output_perturbed(nnet_delta_output); + nnet_output_perturbed.AddMat(1.0, nnet_output); + + DenominatorComputation denominator_computation_perturbed(opts, den_graph, + num_sequences, + nnet_output_perturbed); + + BaseFloat forward_prob_perturbed = denominator_computation_perturbed.Forward(); + observed_objf_changes(p) = forward_prob_perturbed - forward_prob; + } + KALDI_LOG << "Predicted objf changes are " << predicted_objf_changes; + KALDI_LOG << "Observed objf changes are " << observed_objf_changes; + { + Vector error(predicted_objf_changes); + error.AddVec(-1.0, observed_objf_changes); + KALDI_LOG << "num-sequences = " << num_sequences << ", frames-per-sequence = " + << frames_per_sequence << ", relative error is " + << (error.Norm(2.0) / predicted_objf_changes.Norm(2.0)); + } + if (frames_per_sequence < 50) { + // we get inaccuracy for long segments, I think because there is a bias when we + // add random noise for it to increase the likelihood (for winner-take-all reasons) + // and for long utterances this bias adds up over the frames and tends to + // outweigh the random component that the gradient predicts (which will tend to + // cancel). + KALDI_ASSERT(predicted_objf_changes.ApproxEqual(observed_objf_changes, 0.25)); + } +} + + + +void ChainSupervisionTest() { + ContextDependency *ctx_dep; + TransitionModel *trans_model = GenRandTransitionModel(&ctx_dep); + const std::vector &phones = trans_model->GetPhones(); + + int32 subsample_factor = RandInt(1, 3); + + int32 phone_sequence_length = RandInt(1, 20); + std::vector > phones_durations(phone_sequence_length); + + CompactLattice clat; + int32 cur_state = clat.AddState(); + clat.SetStart(cur_state); + + for (int32 i = 0; i < phone_sequence_length; i++) { + int32 phone = phones[RandInt(0, phones.size() - 1)]; + int32 min_length = trans_model->GetTopo().MinLength(phone), + headroom = 5, + duration = RandInt(subsample_factor * min_length, + subsample_factor * min_length + headroom); + phones_durations[i].first = phone; + phones_durations[i].second = duration; + int32 next_state = clat.AddState(); + std::vector ones(duration, 1); + clat.AddArc(cur_state, + CompactLatticeArc(phone, phone, + CompactLatticeWeight(LatticeWeight::One(), + ones), next_state)); + cur_state = next_state; + } + clat.SetFinal(cur_state, CompactLatticeWeight::One()); + ProtoSupervision proto_sup1, proto_sup2; + SupervisionOptions opts; + opts.frame_subsampling_factor = subsample_factor; + bool ans1 = AlignmentToProtoSupervision(opts, phones_durations, &proto_sup1), + ans2 = PhoneLatticeToProtoSupervision(opts, clat, &proto_sup2); + KALDI_ASSERT(ans1 && ans2); + KALDI_ASSERT(proto_sup1 == proto_sup2); + + Supervision supervision; + if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, + proto_sup1, &supervision)) { + // we shouldn't fail because we multiplied by + // 'subsample_factor' when creating the duration. 
+ KALDI_ERR << "Failed creating supervision."; + } + supervision.Check(*trans_model); + TestSupervisionIo(supervision); + TestSupervisionSplitting(*ctx_dep, *trans_model, supervision); + TestSupervisionAppend(*trans_model, supervision); + + { + fst::StdVectorFst den_fst; + ComputeExampleDenFst(*ctx_dep, *trans_model, &den_fst); + DenominatorGraph den_graph(den_fst, trans_model->NumPdfs()); + ChainDenominatorTest(den_graph); + if (RandInt(0, 1) == 0) + supervision.weight = 0.5; + fst::StdVectorFst normalization_fst; + den_graph.GetNormalizationFst(den_fst, &normalization_fst); + // add the weight to the numerator FST so we can assert objf <= 0. + bool ans = AddWeightToSupervisionFst(normalization_fst, &supervision); + KALDI_ASSERT(ans); + // TODO: still have to test for appended sequences. + ChainTrainingTest(den_graph, supervision); + } + + delete ctx_dep; + delete trans_model; +} + +void AddArc(int32 from, int32 to, + fst::StdVectorFst *fst) { + fst->AddArc(from, fst::StdArc(0, 0, fst::TropicalWeight::One(), to)); +} + +void BreadthFirstTest() { + using namespace fst; + StdVectorFst fst; + for (int32 i = 0; i < 6; i++) + fst.AddState(); + fst.SetStart(0); + fst.SetFinal(2, TropicalWeight::One()); + AddArc(0, 3, &fst); + AddArc(0, 4, &fst); + AddArc(4, 5, &fst); + AddArc(3, 5, &fst); + AddArc(5, 1, &fst); + AddArc(1, 2, &fst); + SortBreadthFirstSearch(&fst); + + KALDI_ASSERT(fst.Properties(fst::kTopSorted, true) != 0); + +} + +// this function tests SplitIntoRanges() and GetWeightsForRanges(). +void TestRanges() { + int32 frames_per_range = RandInt(20, 100), + overlap = RandInt(0, 10), + num_frames = RandInt(15, 500); + std::vector range_starts; + SplitIntoRanges(num_frames - overlap, frames_per_range - overlap, + &range_starts); + Vector weights_orig(num_frames), + weights_new(num_frames); + int32 num_ranges = range_starts.size(); + for (int32 i = 0; i < num_ranges; i++) { + int32 start_t = range_starts[i]; + for (int32 j = 0; j < frames_per_range; j++) { + int32 t = start_t + j; + weights_orig(t) += 1.0; + } + } + std::vector > weights; + GetWeightsForRanges(frames_per_range, + range_starts, &weights); + for (int32 i = 0; i < num_ranges; i++) { + KALDI_LOG << "weights[" << i << "] = " + << weights[i]; + int32 start_t = range_starts[i]; + for (int32 j = 0; j < frames_per_range; j++) { + int32 t = start_t + j; + weights_new(t) += weights[i](j); + } + } + KALDI_LOG << "Orig weights are " << weights_orig; + KALDI_LOG << "New weights are " << weights_new; + for (int32 t = 0; t < num_frames; t++) { + if (weights_orig(t) != 0.0) { + KALDI_ASSERT(fabs(weights_new(t) - 1.0) < 0.001); + } else { + KALDI_ASSERT(weights_new(t) == 0.0); + } + } +} + + +} // namespace chain +} // namespace kaldi + +int main() { + using namespace kaldi; + + for (int32 loop = 0; loop < 2; loop++) { +#if HAVE_CUDA == 1 + if (loop == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + for (int32 i = 0; i < 5; i++) { + kaldi::chain::ChainSupervisionTest(); + kaldi::chain::BreadthFirstTest(); + } + kaldi::chain::TestRanges(); +#if HAVE_CUDA == 1 + CuDevice::Instantiate().PrintProfile(); +#endif + } +} diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc new file mode 100644 index 00000000000..3074e9c7742 --- /dev/null +++ b/src/chain/chain-supervision.cc @@ -0,0 +1,831 @@ +// chain/chain-supervision.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-supervision.h" +#include "lat/lattice-functions.h" +#include "util/text-utils.h" +#include "hmm/hmm-utils.h" +#include + +namespace kaldi { +namespace chain { + +const int kSupervisionMaxStates = 200000; // we can later make this + // configurable if needed. + +// attempts determinization (with limited max-states) and minimization; +// returns true on success +bool TryDeterminizeMinimize(int32 supervision_max_states, + fst::StdVectorFst *supervision_fst) { + if (supervision_fst->NumStates() >= supervision_max_states) { + KALDI_WARN << "Not attempting determinization as number of states " + << "is too large " << supervision_fst->NumStates(); + return false; + } + fst::DeterminizeOptions opts; + opts.state_threshold = supervision_max_states; + fst::StdVectorFst fst_copy = *supervision_fst; + fst::Determinize(fst_copy, supervision_fst, opts); + // the - 1 here is just because I'm not sure if it stops just before the + // threshold. + if (supervision_fst->NumStates() >= opts.state_threshold - 1) { + KALDI_WARN << "Determinization stopped early after reaching " + << supervision_fst->NumStates() << " states. Likely " + << "this utterance has a very strange transcription."; + return false; + } + fst::Minimize(supervision_fst); + return true; +} + +void ProtoSupervision::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + int32 num_frames = allowed_phones.size(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_frames); + if (!binary) os << "\n"; + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + for (int32 i = 0; i < num_frames; i++) + WriteIntegerVector(os, binary, allowed_phones[i]); + if (!binary) os << "\n"; + WriteFstKaldi(os, binary, fst); + WriteToken(os, binary, ""); + if (!binary) os << "\n"; +} + +void SupervisionOptions::Check() const { + KALDI_ASSERT(left_tolerance >= 0 && right_tolerance >= 0 && + frame_subsampling_factor > 0 && + left_tolerance + right_tolerance >= frame_subsampling_factor); +} + +bool AlignmentToProtoSupervision(const SupervisionOptions &opts, + const std::vector &phones, + const std::vector &durations, + ProtoSupervision *proto_supervision) { + opts.Check(); + KALDI_ASSERT(phones.size() > 0 && phones.size() == durations.size()); + std::vector labels(phones.size()); + int32 num_frames = std::accumulate(durations.begin(), durations.end(), 0), + factor = opts.frame_subsampling_factor, + num_frames_subsampled = (num_frames + factor - 1) / factor; + proto_supervision->allowed_phones.clear(); + proto_supervision->allowed_phones.resize(num_frames_subsampled); + proto_supervision->fst.DeleteStates(); + if (num_frames_subsampled == 0) + return false; + + int32 current_frame = 0, num_phones = phones.size(); + for (int32 i = 0; i < num_phones; i++) { + int32 phone = phones[i], duration = durations[i]; + KALDI_ASSERT(phone > 0 && duration > 0); + int32 t_start = 
std::max(0, (current_frame - opts.left_tolerance)), + t_end = std::min(num_frames, + (current_frame + duration + opts.right_tolerance)), + t_start_subsampled = (t_start + factor - 1) / factor, + t_end_subsampled = (t_end + factor - 1) / factor; + + // note: if opts.Check() passed, the following assert should pass too. + KALDI_ASSERT(t_end_subsampled > t_start_subsampled && + t_end_subsampled <= num_frames_subsampled); + for (int32 t_subsampled = t_start_subsampled; + t_subsampled < t_end_subsampled; t_subsampled++) + proto_supervision->allowed_phones[t_subsampled].push_back(phone); + current_frame += duration; + } + KALDI_ASSERT(current_frame == num_frames); + for (int32 t_subsampled = 0; t_subsampled < num_frames_subsampled; + t_subsampled++) { + KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); + SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); + } + fst::MakeLinearAcceptor(phones, &(proto_supervision->fst)); + return true; +} + +bool AlignmentToProtoSupervision( + const SupervisionOptions &opts, + const std::vector > &phones_durations, + ProtoSupervision *proto_supervision) { + KALDI_ASSERT(phones_durations.size() > 0); + std::vector phones(phones_durations.size()), + durations(phones_durations.size()); + for (size_t size = phones_durations.size(), i = 0; i < size; i++) { + phones[i] = phones_durations[i].first; + durations[i] = phones_durations[i].second; + } + return AlignmentToProtoSupervision(opts, phones, durations, + proto_supervision); +} + + +bool ProtoSupervision::operator == (const ProtoSupervision &other) const { + return (allowed_phones == other.allowed_phones && + fst::Equal(fst, other.fst)); +} + +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &lat, + ProtoSupervision *proto_supervision) { + opts.Check(); + if (lat.NumStates() == 0) { + KALDI_WARN << "Empty lattice provided"; + return false; + } + int32 num_states = lat.NumStates(); + proto_supervision->fst.DeleteStates(); + proto_supervision->fst.ReserveStates(num_states); + std::vector state_times; + int32 num_frames = CompactLatticeStateTimes(lat, &state_times), + factor = opts.frame_subsampling_factor, + num_frames_subsampled = (num_frames + factor - 1) / factor; + for (int32 state = 0; state < num_states; state++) + proto_supervision->fst.AddState(); + proto_supervision->fst.SetStart(lat.Start()); + + proto_supervision->allowed_phones.clear(); + proto_supervision->allowed_phones.resize(num_frames_subsampled); + + for (int32 state = 0; state < num_states; state++) { + int32 state_time = state_times[state]; + for (fst::ArcIterator aiter(lat, state); !aiter.Done(); + aiter.Next()) { + const CompactLatticeArc &lat_arc = aiter.Value(); + int32 next_state_time = state_time + lat_arc.weight.String().size(); + int32 phone = lat_arc.ilabel; // It's an acceptor so ilabel == ollabel. + if (phone == 0) { + KALDI_WARN << "CompactLattice has epsilon arc. 
Unexpected."; + return false; + } + proto_supervision->fst.AddArc(state, + fst::StdArc(phone, phone, + fst::TropicalWeight::One(), + lat_arc.nextstate)); + int32 t_begin = std::max(0, (state_time - opts.left_tolerance)), + t_end = std::min(num_frames, + (next_state_time + opts.right_tolerance)), + t_begin_subsampled = (t_begin + factor - 1)/ factor, + t_end_subsampled = (t_end + factor - 1)/ factor; + for (int32 t_subsampled = t_begin_subsampled; + t_subsampled < t_end_subsampled; t_subsampled++) + proto_supervision->allowed_phones[t_subsampled].push_back(phone); + } + if (lat.Final(state) != CompactLatticeWeight::Zero()) { + proto_supervision->fst.SetFinal(state, fst::TropicalWeight::One()); + if (state_times[state] != num_frames) { + KALDI_WARN << "Time of final state " << state << " in lattice is " + << "not equal to number of frames " << num_frames + << ". Are you sure the lattice is phone-aligned? " + << "Rejecting it."; + return false; + } + } + } + for (int32 t_subsampled = 0; t_subsampled < num_frames_subsampled; + t_subsampled++) { + KALDI_ASSERT(!proto_supervision->allowed_phones[t_subsampled].empty()); + SortAndUniq(&(proto_supervision->allowed_phones[t_subsampled])); + } + return true; +} + + +bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { + // the following call will do the range-check on 'ilabel'. + int32 phone = trans_model_.TransitionIdToPhone(ilabel); + KALDI_ASSERT(static_cast(s) <= allowed_phones_.size()); + if (static_cast(s) == allowed_phones_.size()) { + // No arcs come from the final state.a + return false; + } + if (std::binary_search(allowed_phones_[s].begin(), + allowed_phones_[s].end(), phone)) { + // the olabel will be a pdf-id plus one, not a transition-id. + int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel); + oarc->ilabel = ilabel; + oarc->olabel = pdf_id + 1; + oarc->weight = fst::TropicalWeight::One(); + oarc->nextstate = s + 1; + return true; + } else { + return false; + } +} + + +bool ProtoSupervisionToSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const ProtoSupervision &proto_supervision, + Supervision *supervision) { + using fst::VectorFst; + using fst::StdArc; + VectorFst phone_fst(proto_supervision.fst); + int32 subsequential_symbol = trans_model.GetPhones().back() + 1; + if (ctx_dep.CentralPosition() != ctx_dep.ContextWidth() - 1) { + // note: this function only adds the subseq symbol to the input of what was + // previously an acceptor, so we project, i.e. copy the ilabels to the + // olabels + AddSubsequentialLoop(subsequential_symbol, &phone_fst); + fst::Project(&phone_fst, fst::PROJECT_INPUT); + } + std::vector disambig_syms; // empty list of diambiguation symbols. + fst::ContextFst cfst(subsequential_symbol, trans_model.GetPhones(), + disambig_syms, ctx_dep.ContextWidth(), + ctx_dep.CentralPosition()); + VectorFst context_dep_fst; + fst::ComposeContextFst(cfst, phone_fst, &context_dep_fst); + // at this point, context_dep_fst will have indexes into 'ilabels' as its + // input symbol (representing context-dependent phones), and phones on its + // output. We don't need the phones, so we'll project. + fst::Project(&context_dep_fst, fst::PROJECT_INPUT); + + std::vector disambig_syms_h; // disambiguation symbols on input side + // of H -- will be empty. + + HTransducerConfig h_cfg; + + // We don't want to add any transition probabilities as they will be added + // when we compose with the denominator graph. 
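+  // [Editor's note: with transition_scale = 0.0 here and self_loop_scale = 0.0
+  // below, the H transducer contributes no weights, so the resulting numerator
+  // FST only constrains which pdf-ids may appear on which frames; probabilities
+  // are reintroduced later when the normalization FST derived from the
+  // denominator graph is composed in (see AddWeightToSupervisionFst in the
+  // test code).]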
+ h_cfg.transition_scale = 0.0; + h_cfg.push_weights = false; // there's nothing to push. + + + VectorFst *h_fst = GetHTransducer(cfst.ILabelInfo(), + ctx_dep, + trans_model, + h_cfg, + &disambig_syms_h); + KALDI_ASSERT(disambig_syms_h.empty()); + + VectorFst transition_id_fst; + TableCompose(*h_fst, context_dep_fst, &transition_id_fst); + delete h_fst; + + // We don't want to add any transition probabilities as they will be added + // when we compose with the denominator graph. + BaseFloat self_loop_scale = 0.0; + + bool reorder = true; // more efficient in general; won't affect results. + // add self-loops to the FST with transition-ids as its labels. + AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, + &transition_id_fst); + + // at this point transition_id_fst will have transition-ids as its ilabels and + // context-dependent phones (indexes into ILabelInfo()) as its olabels. + // Discard the context-dependent phones by projecting on the input, keeping + // only the transition-ids. + fst::Project(&transition_id_fst, fst::PROJECT_INPUT); + if (transition_id_fst.Properties(fst::kIEpsilons, true) != 0) { + // remove epsilons, if there are any. + fst::RmEpsilon(&transition_id_fst); + } + KALDI_ASSERT(transition_id_fst.NumStates() > 0); + + // The last step is to enforce that phones can only appear on the frames they + // are 'allowed' to appear on. This will also convert the FST to have pdf-ids + // plus one as the labels + TimeEnforcerFst enforcer_fst(trans_model, proto_supervision.allowed_phones); + ComposeDeterministicOnDemand(transition_id_fst, + &enforcer_fst, + &(supervision->fst)); + fst::Connect(&(supervision->fst)); + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. + fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + if (supervision->fst.NumStates() == 0) { + KALDI_WARN << "Supervision FST is empty (too many phones for too few " + << "frames?)"; + // possibly there were too many phones for too few frames. + return false; + } + + supervision->weight = 1.0; + supervision->num_sequences = 1; + supervision->frames_per_sequence = proto_supervision.allowed_phones.size(); + supervision->label_dim = trans_model.NumPdfs(); + SortBreadthFirstSearch(&(supervision->fst)); + return true; +} + + + +SupervisionSplitter::SupervisionSplitter( + const Supervision &supervision): + supervision_(supervision), + frame_(supervision_.fst.NumStates(), -1) { + const fst::StdVectorFst &fst(supervision_.fst); + // The fst in struct Supervision is supposed to be epsilon-free and + // topologically sorted; this function relies on those properties to + // set up the frame_ vector (which maps each state in the + // FST to a frame-index 0 <= t < num_frames), and it checks them. + if (supervision_.num_sequences != 1) { + KALDI_WARN << "Splitting already-reattached sequence (only expected in " + << "testing code)"; + } + int32 num_states = fst.NumStates(), + num_frames = supervision_.frames_per_sequence * supervision_.num_sequences; + KALDI_ASSERT(num_states > 0); + int32 start_state = fst.Start(); + // FST should be top-sorted and connected, so start-state must be 0. 
+ KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); + frame_[start_state] = 0; + for (int32 state = 0; state < num_states; state++) { + int32 cur_frame = frame_[state]; + if (cur_frame == -1) { + // If this happens it means the Supervision does not have the required + // properties, e.g. being top-sorted and connected. + KALDI_ERR << "Error computing frame indexes for Supervision"; + } + for (fst::ArcIterator aiter(fst, state); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + // The FST is supposed to be an epsilon-free acceptor. + KALDI_ASSERT(arc.ilabel == arc.olabel && arc.ilabel > 0); + int32 nextstate = arc.nextstate; + KALDI_ASSERT(nextstate >= 0 && nextstate < num_states); + // all arcs go from some t to t + 1. + int32 &next_frame = frame_[nextstate]; + if (next_frame == -1) + next_frame = cur_frame + 1; + else + KALDI_ASSERT(next_frame == cur_frame + 1); + } + } + // The following assert checks that the number of frames in the FST + // matches the num_frames stored in the supervision object; it also relies + // on the topological sorting and connectedness of the FST. + KALDI_ASSERT(frame_.back() == num_frames); + std::vector::iterator iter = frame_.begin(), + end = iter + (frame_.size() - 1); + // check that the frame-indexes of states are monotonically non-decreasing, as + // they should be based on the top-sorting. We rely on this property to + // compute the frame ranges while splitting. + while (iter != end) { + int32 cur_t = *iter; + ++iter; + int32 next_t = *iter; + KALDI_ASSERT(next_t >= cur_t); + } +} + +void SupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, + Supervision *out_supervision) const { + int32 end_frame = begin_frame + num_frames; + // Note: end_frame is not included in the range of frames that the + // output supervision object covers; it's one past the end. + KALDI_ASSERT(num_frames > 0 && begin_frame >= 0 && + begin_frame + num_frames <= + supervision_.num_sequences * supervision_.frames_per_sequence); + std::vector::const_iterator begin_iter = + std::lower_bound(frame_.begin(), frame_.end(), begin_frame), + end_iter = std::lower_bound(begin_iter, frame_.end(), end_frame); + KALDI_ASSERT(*begin_iter == begin_frame && + (begin_iter == frame_.begin() || begin_iter[-1] < begin_frame)); + // even if end_frame == supervision_.num_frames, there should be a state with + // that frame index. + KALDI_ASSERT(end_iter[-1] < end_frame && + (end_iter < frame_.end() || *end_iter == end_frame)); + int32 begin_state = begin_iter - frame_.begin(), + end_state = end_iter - frame_.begin(); + + CreateRangeFst(begin_frame, end_frame, + begin_state, end_state, &(out_supervision->fst)); + + KALDI_ASSERT(out_supervision->fst.NumStates() > 0); + KALDI_ASSERT(supervision_.num_sequences == 1); + out_supervision->num_sequences = 1; + out_supervision->weight = supervision_.weight; + out_supervision->frames_per_sequence = num_frames; + out_supervision->label_dim = supervision_.label_dim; +} + +void SupervisionSplitter::CreateRangeFst( + int32 begin_frame, int32 end_frame, + int32 begin_state, int32 end_state, + fst::StdVectorFst *fst) const { + // There will be a special pre-start state that has epsilon transitions to all + // states whose frame equals begin_frame; we'll later do RmEpsilon to remove + // these. Next we will include all states begin_state <= s < end_state in the + // output FST, plus (if end_frame != supervision_.num_frames) a special final + // state. 
All transitions to states >= end_state will be turned into + // a transition to the special final state. There should be no final-probs + // on the states begin_state <= s < end_state. + KALDI_ASSERT(end_state > begin_state); + fst->DeleteStates(); + fst->ReserveStates(end_state - begin_state + 2); + int32 start_state = fst->AddState(); + fst->SetStart(start_state); + for (int32 i = begin_state; i < end_state; i++) + fst->AddState(); + // Add the special final-state. + int32 final_state = fst->AddState(); + fst->SetFinal(final_state, fst::TropicalWeight::One()); + for (int32 state = begin_state; state < end_state; state++) { + int32 output_state = state - begin_state + 1; + if (frame_[state] == begin_frame) { + // we'd like to make this an initial state, but OpenFst doesn't allow + // multiple initial states. Instead we add an epsilon transition to it + // from our actual initial state; we'll later do RmEpsilon and + // determinize. + fst->AddArc(start_state, + fst::StdArc(0, 0, fst::TropicalWeight::One(), + output_state)); + } else { + KALDI_ASSERT(frame_[state] < end_frame); + } + typedef fst::ArcIterator IterType; + for (IterType aiter(supervision_.fst, state); !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc(aiter.Value()); + int32 nextstate = arc.nextstate; + if (nextstate >= end_state) { + // A transition to any state outside the range becomes a transition to + // our special final-state. + fst->AddArc(output_state, + fst::StdArc(arc.ilabel, arc.olabel, + arc.weight, final_state)); + } else { + int32 output_nextstate = arc.nextstate - begin_state + 1; + // note: arc.ilabel should equal arc.olabel and arc.weight should equal + // fst::TropicalWeight::One(). + fst->AddArc(output_state, + fst::StdArc(arc.ilabel, arc.olabel, + arc.weight, output_nextstate)); + } + } + } +} + + +// I couldn't figure out how to do this with OpenFST's native 'visitor' and +// queue mechanisms so I'm just coding this myself. +void SortBreadthFirstSearch(fst::StdVectorFst *fst) { + std::vector state_order(fst->NumStates(), -1); + std::vector seen(fst->NumStates(), false); + int32 start_state = fst->Start(); + KALDI_ASSERT(start_state >= 0); + std::deque queue; + queue.push_back(start_state); + seen[start_state] = true; + int32 num_output = 0; + while (!queue.empty()) { + int32 state = queue.front(); + state_order[state] = num_output++; + queue.pop_front(); + for (fst::ArcIterator aiter(*fst, state); + !aiter.Done(); aiter.Next()) { + int32 nextstate = aiter.Value().nextstate; + if (!seen[nextstate]) { + seen[nextstate] = true; + queue.push_back(nextstate); + } + } + } + if (num_output != fst->NumStates()) + KALDI_ERR << "Input to SortBreadthFirstSearch must be connected."; + fst::StateSort(fst, state_order); +} + + + +void Supervision::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, weight); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_sequences); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, frames_per_sequence); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, label_dim); + KALDI_ASSERT(frames_per_sequence > 0 && label_dim > 0 && + num_sequences > 0); + if (binary == false) { + // In text mode, write the FST without any compactification. + WriteFstKaldi(os, binary, fst); + } else { + // Write using StdAcceptorCompactFst, making use of the fact that it's an + // acceptor. 
+ fst::FstWriteOptions write_options(""); + fst::StdCompactAcceptorFst::WriteFst( + fst, fst::AcceptorCompactor(), os, + write_options); + } + WriteToken(os, binary, ""); +} + +void Supervision::Swap(Supervision *other) { + std::swap(weight, other->weight); + std::swap(num_sequences, other->num_sequences); + std::swap(frames_per_sequence, other->frames_per_sequence); + std::swap(label_dim, other->label_dim); + std::swap(fst, other->fst); +} + +void Supervision::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &weight); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_sequences); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &frames_per_sequence); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &label_dim); + if (!binary) { + ReadFstKaldi(is, binary, &fst); + } else { + fst::StdCompactAcceptorFst *compact_fst = + fst::StdCompactAcceptorFst::Read( + is, fst::FstReadOptions(std::string("[unknown]"))); + if (compact_fst == NULL) + KALDI_ERR << "Error reading compact FST from disk"; + fst = *compact_fst; + delete compact_fst; + } + // ReadFstKaldi will work even though we wrote using a compact format. + ExpectToken(is, binary, ""); +} + +int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, + std::vector *state_times) { + if (fst.Start() != 0) // this is implied by our properties. + KALDI_ERR << "Expecting input FST start state to be zero"; + int32 num_states = fst.NumStates(); + int32 total_length = -1; + state_times->clear(); + state_times->resize(num_states, -1); + (*state_times)[0] = 0; + for (int32 state = 0; state < num_states; state++) { + int32 next_state_time = (*state_times)[state] + 1; + if (next_state_time <= 0) // i.e. (*state_times)[state] < 0 + KALDI_ERR << "Input FST does not have required properties."; + for (fst::ArcIterator aiter(fst, state); + !aiter.Done(); aiter.Next()) { + const fst::StdArc &arc = aiter.Value(); + KALDI_ASSERT(arc.ilabel != 0); + int32 &next_state_ref = (*state_times)[arc.nextstate]; + if (next_state_ref == -1) + next_state_ref = next_state_time; + else if (next_state_ref != next_state_time) + KALDI_ERR << "Input FST does not have required properties."; + } + if (fst.Final(state) != fst::TropicalWeight::Zero()) { + if (total_length == -1) + total_length = next_state_time - 1; + else if (total_length != next_state_time - 1) + KALDI_ERR << "Input FST does not have required properties."; + } + } + if (total_length < 0) + KALDI_ERR << "Input FST does not have required properties."; + return total_length; +} + +Supervision::Supervision(const Supervision &other): + weight(other.weight), num_sequences(other.num_sequences), + frames_per_sequence(other.frames_per_sequence), + label_dim(other.label_dim), fst(other.fst) { } + +void AppendSupervision(const std::vector &input, + bool compactify, + std::vector *output_supervision) { + KALDI_ASSERT(!input.empty()); + int32 label_dim = input[0]->label_dim, + num_inputs = input.size(); + if (num_inputs == 1) { + output_supervision->resize(1); + (*output_supervision)[0] = *(input[0]); + return; + } + std::vector output_was_merged; + for (int32 i = 1; i < num_inputs; i++) + KALDI_ASSERT(input[i]->label_dim == label_dim && + "Trying to append incompatible Supervision objects"); + output_supervision->clear(); + output_supervision->reserve(input.size()); + for (int32 i = 0; i < input.size(); i++) { + const Supervision &src = *(input[i]); + if (compactify && !output_supervision->empty() && + 
output_supervision->back().weight == src.weight && + output_supervision->back().frames_per_sequence == + src.frames_per_sequence) { + // Combine with current output + // append src.fst to output_supervision->fst. + fst::Concat(&output_supervision->back().fst, src.fst); + output_supervision->back().num_sequences++; + output_was_merged.back() = true; + } else { + output_supervision->resize(output_supervision->size() + 1); + output_supervision->back() = src; + output_was_merged.push_back(false); + } + } + KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); + for (size_t i = 0; i < output_supervision->size(); i++) { + if (output_was_merged[i]) { + fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; + // The process of concatenation will have introduced epsilons. + fst::RmEpsilon(&out_fst); + SortBreadthFirstSearch(&out_fst); + } + } +} + +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision) { + // remove epsilons before composing. 'normalization_fst' has noepsilons so + // the composed result will be epsilon free. + fst::StdVectorFst supervision_fst_noeps(supervision->fst); + fst::RmEpsilon(&supervision_fst_noeps); + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &supervision_fst_noeps)) + return false; + + // note: by default, 'Compose' will call 'Connect', so if the + // resulting FST is not connected, it will end up empty. + fst::StdVectorFst composed_fst; + fst::Compose(supervision_fst_noeps, normalization_fst, + &composed_fst); + if (composed_fst.NumStates() == 0) + return false; + // projection should not be necessary, as both FSTs are acceptors. + // determinize and minimize to make it as compact as possible. + + if (!TryDeterminizeMinimize(kSupervisionMaxStates, + &composed_fst)) + return false; + supervision->fst = composed_fst; + + // Make sure the states are numbered in increasing order of time. + SortBreadthFirstSearch(&(supervision->fst)); + KALDI_ASSERT(supervision->fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); + return true; +} + +void SplitIntoRanges(int32 num_frames, + int32 frames_per_range, + std::vector *range_starts) { + if (frames_per_range > num_frames) { + range_starts->clear(); + return; // there is no room for even one range. + } + int32 num_ranges = num_frames / frames_per_range, + extra_frames = num_frames % frames_per_range; + // this is a kind of heuristic. If the number of frames we'd + // be skipping is less than 1/4 of the frames_per_range, then + // skip frames; otherwise, duplicate frames. + // it's important that this is <=, not <, so that if + // extra_frames == 0 and frames_per_range is < 4, we + // don't insert an extra range. + if (extra_frames <= frames_per_range / 4) { + // skip frames. we do this at start or end, or between ranges. + std::vector num_skips(num_ranges + 1, 0); + for (int32 i = 0; i < extra_frames; i++) + num_skips[RandInt(0, num_ranges)]++; + range_starts->resize(num_ranges); + int32 cur_start = num_skips[0]; + for (int32 i = 0; i < num_ranges; i++) { + (*range_starts)[i] = cur_start; + cur_start += frames_per_range; + cur_start += num_skips[i + 1]; + } + KALDI_ASSERT(cur_start == num_frames); + } else { + // duplicate frames. 
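+    // For example, with num_frames = 25 and frames_per_range = 10 we get
+    // extra_frames = 5, which is more than 10 / 4, so we take this branch:
+    // we emit num_ranges = 3 ranges and 'backtrack' a total of
+    // frames_per_range - extra_frames = 5 frames between them, giving e.g.
+    // range_starts = { 0, 7, 15 } (the exact split is randomized).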
+ num_ranges++; + int32 num_duplicated_frames = frames_per_range - extra_frames; + // the way we handle the 'extra_frames' frames of output is that we + // backtrack zero or more frames between outputting each pair of ranges, and + // the total of these backtracks equals 'extra_frames'. + std::vector num_backtracks(num_ranges, 0); + for (int32 i = 0; i < num_duplicated_frames; i++) { + // num_ranges - 2 below is not a bug. we only want to backtrack + // between ranges, not past the end of the last range (i.e. at + // position num_ranges - 1). we make the vector one longer to + // simplify the loop below. + num_backtracks[RandInt(0, num_ranges - 2)]++; + } + range_starts->resize(num_ranges); + int32 cur_start = 0; + for (int32 i = 0; i < num_ranges; i++) { + (*range_starts)[i] = cur_start; + cur_start += frames_per_range; + cur_start -= num_backtracks[i]; + } + KALDI_ASSERT(cur_start == num_frames); + } +} + +bool Supervision::operator == (const Supervision &other) const { + return weight == other.weight && num_sequences == other.num_sequences && + frames_per_sequence == other.frames_per_sequence && + label_dim == other.label_dim && fst::Equal(fst, other.fst); +} + +void Supervision::Check(const TransitionModel &trans_mdl) const { + if (weight <= 0.0) + KALDI_ERR << "Weight should be positive."; + if (frames_per_sequence <= 0) + KALDI_ERR << "Invalid frames_per_sequence: " << frames_per_sequence; + if (num_sequences <= 0) + KALDI_ERR << "Invalid num_sequences: " << num_sequences; + if (label_dim != trans_mdl.NumPdfs()) + KALDI_ERR << "Invalid label-dim: " << label_dim + << ", expected " << trans_mdl.NumPdfs(); + std::vector state_times; + if (frames_per_sequence * num_sequences != + ComputeFstStateTimes(fst, &state_times)) + KALDI_ERR << "Num-frames does not match fst."; +} + +void GetWeightsForRanges(int32 range_length, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + for (int32 i = 0; i + 1 < num_ranges; i++) { + int32 j = i + 1; + int32 i_start = range_starts[i], i_end = i_start + range_length, + j_start = range_starts[j]; + KALDI_ASSERT(j_start > i_start); + if (i_end > j_start) { + Vector &i_weights = (*weights)[i], &j_weights = (*weights)[j]; + + int32 overlap_length = i_end - j_start; + // divide the overlapping piece of the 2 ranges into 3 regions of + // approximately equal size, called the left, middle and right region. + int32 left_length = overlap_length / 3, + middle_length = (overlap_length - left_length) / 2, + right_length = overlap_length - left_length - middle_length; + KALDI_ASSERT(left_length >= 0 && middle_length >= 0 && right_length >= 0 && + left_length + middle_length + right_length == overlap_length); + // set the weight of the left region to be zero for the right (j) range. + for (int32 k = 0; k < left_length; k++) + j_weights(k) = 0.0; + // set the weight of the right region to be zero for the left (i) range. + for (int32 k = 0; k < right_length; k++) + i_weights(range_length - 1 - k) = 0.0; + // for the middle range, linearly interpolate between the 0's and 1's. + // note: we multiply with existing weights instead of set in order to get + // more accurate behavior in the unexpected case where things triply + // overlap. 
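+      // For example, if overlap_length = 7 then left_length = 2,
+      // middle_length = 2 and right_length = 3: the left 2 overlapped frames
+      // get weight 0 in the right range, the right 3 get weight 0 in the left
+      // range, and the middle 2 get weights 0.25 and 0.75, so each overlapped
+      // frame still receives a total weight of 1.0 across the two ranges.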
+ for (int32 k = 0; k < middle_length; k++) { + BaseFloat weight = (0.5 + k) / middle_length; + j_weights(left_length + k) = weight; + i_weights(range_length - 1 - right_length - k) = weight; + } + } + } +} + + +void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights) { + KALDI_ASSERT(range_length > 0 && num_frames_zeroed * 2 < range_length); + int32 num_ranges = range_starts.size(); + weights->resize(num_ranges); + for (int32 i = 0; i < num_ranges; i++) { + (*weights)[i].Resize(range_length); + (*weights)[i].Set(1.0); + } + if (num_frames_zeroed == 0) + return; + for (int32 i = 1; i < num_ranges; i++) + (*weights)[i].Range(0, num_frames_zeroed).Set(0.0); + for (int32 i = 0; i + 1 < num_ranges; i++) + (*weights)[i].Range(range_length - num_frames_zeroed, + num_frames_zeroed).Set(0.0); +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h new file mode 100644 index 00000000000..2dda8baf1e4 --- /dev/null +++ b/src/chain/chain-supervision.h @@ -0,0 +1,434 @@ +// chain/chain-supervision.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_SUPERVISION_H_ +#define KALDI_CHAIN_CHAIN_SUPERVISION_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "lat/kaldi-lattice.h" +#include "fstext/deterministic-fst.h" +#include "hmm/transition-model.h" + +namespace kaldi { +namespace chain { + +/* + This file contains some declarations relating to the object we use to + encode the supervision information for the 'chain' model. + + If we were training the model on whole utterances we could just use the + reference phone sequence, but to make it easier to train on parts of + utterances (and also for efficiency) we use the time-alignment information, + extended by a user-specified margin, to limit the range of frames + that the phones can appear at. +*/ + + +struct SupervisionOptions { + int32 left_tolerance; + int32 right_tolerance; + int32 frame_subsampling_factor; + + SupervisionOptions(): left_tolerance(5), + right_tolerance(5), + frame_subsampling_factor(1) { } + + void Register(OptionsItf *opts) { + opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " + "shift in phone position relative to the alignment"); + opts->Register("right-tolerance", &right_tolerance, "Right tolerance for " + "shift in phone position relative to the alignment"); + opts->Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate for the chain model will be less than the " + "frame-rate of the original alignment. 
Applied after " + "left-tolerance and right-tolerance are applied (so they are " + "in terms of the original num-frames."); + } + void Check() const; +}; + + +// This is the form that the supervision information for 'chain' models takes +// we compile it to Supervision. +// The normal compilation sequence is: +// (AlignmentToProtoSupervision or PhoneLatticeToProtoSupervision) +// Then you would call ProtoSupervisionToSupervision. + +struct ProtoSupervision { + // a list of (sorted, unique) lists of phones that are allowed + // on each frame. number of frames is allowed_phones.size(), which + // will equal the path length in 'fst'. + std::vector > allowed_phones; + + // The FST of phones; an epsilon-free acceptor. + fst::StdVectorFst fst; + + bool operator == (const ProtoSupervision &other) const; + + // We have a Write but no Read function; this Write function is + // only needed for debugging. + void Write(std::ostream &os, bool binary) const; +}; + +/** Creates a ProtoSupervision from a vector of phones and their durations, + such as might be derived from a training-data alignment (see the function + SplitToPhones()). Note: this probably isn't the normal way you'll do it, + it might be better to start with a phone-aligned lattice so you can capture + the alternative pronunciations; see PhoneLatticeToProtoSupervision(). + Returns true on success (the only possible failure is that total duration < + opts.subsampling_factor). */ +bool AlignmentToProtoSupervision(const SupervisionOptions &opts, + const std::vector &phones, + const std::vector &durations, + ProtoSupervision *proto_supervision); + +/** Creates a ProtoSupervision object from a vector of (phone, duration) pairs + (see the function SplitToPhones()). This does the same jobs as the other + AlignmentToProtoSupervision, from different input. + */ +bool AlignmentToProtoSupervision( + const SupervisionOptions &opts, + const std::vector > &phones_durs, + ProtoSupervision *proto_supervision); + + +/** Creates a proto-supervision from a phone-aligned phone lattice (i.e. a + lattice with phones as the labels, and with the transition-ids aligned with + the phones so you can compute the correct times. The normal path to + create such a lattice would be to generate a lattice containing multiple + pronunciations of the transcript by using steps/align_fmllr_lats.sh or a + similar script, followed by lattice-align-phones + --replace-output-symbols=true. + Returns true on success, and false on failure (the only failure modes are that + the number of frames in the lattice is less than opts.frame_subsampling_factor, + or there are epsilon phones in the lattice, or the final-probs have alignments + on them. +*/ +bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, + const CompactLattice &clat, + ProtoSupervision *proto_supervision); + + +/** Modifies the duration information (start_time and end_time) of each phone + instance by the left_tolerance and right_tolerance (being careful not to go + over the edges of the utterance) and then applies frame-rate subsampling by + dividing each frame index in start_times and end_times , and num_frames, by + frame_subsampling_factor. Requires that proto_supervision->num_frames >= + options.frame_subsampling_factor. 
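+    For example, with left_tolerance = 5 and right_tolerance = 5, a phone
+    originally aligned to frames 10 .. 16 becomes allowed anywhere in frames
+    5 .. 21 (clipped to the edges of the utterance), before the division by
+    frame_subsampling_factor is applied.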
+ +*/ +void ModifyProtoSupervisionTimes(const SupervisionOptions &options, + ProtoSupervision *proto_supervision); + + + +/** + This class wraps the vector of allowed phones for each frame to create a + DeterministicOnDemandFst that we can compose with the decoding-graph FST to + limit the frames on which these phones are allowed to appear. This FST also + helps us convert the labels from transition-ids to (pdf-ids plus one), which + is what we'll be using in the forward-backward (it avoids the need to + keep the transition model around). + + Suppose the number of frames is T, then there will be T+1 states in + this FST, numbered from 0 to T+1, where state 0 is initial and state + T+1 is final. A transition is only allowed from state t to state t+1 + with a particular transition-id as its ilabel, if the corresponding + phone is listed in the 'allowed_phones' for that frame. The olabels + are pdf-ids plus one. + */ +class TimeEnforcerFst: + public fst::DeterministicOnDemandFst { + public: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + + TimeEnforcerFst(const TransitionModel &trans_model, + const std::vector > &allowed_phones): + trans_model_(trans_model), + allowed_phones_(allowed_phones) { } + + // We cannot use "const" because the pure virtual function in the interface is + // not const. + virtual StateId Start() { return 0; } + + virtual Weight Final(StateId s) { + return (s == allowed_phones_.size() ? Weight::One() : Weight::Zero()); + } + + // The ilabel is a transition-id; the state is interpreted as a frame-index. + // The olabel on oarc will be a pdf-id. The state-id is the time index 0 <= t + // <= num_frames. All transitions are to the next frame (but not all are + // allowed). The interface of GetArc requires ilabel to be nonzero (not + // epsilon). + virtual bool GetArc(StateId s, Label ilabel, fst::StdArc* oarc); + + private: + const TransitionModel &trans_model_; + const std::vector > &allowed_phones_; +}; + + +// struct Supervision is the fully-processed supervision information for +// a whole utterance or (after splitting) part of an utterance. It contains the +// time limits on phones encoded into the FST. +struct Supervision { + // The weight of this example (will usually be 1.0). + BaseFloat weight; + + // num_sequences will be 1 if you create a Supervision object from a single + // lattice or alignment, but if you combine multiple Supevision objects + // the 'num_sequences' is the number of objects that were combined (the + // FSTs get appended). + int32 num_sequences; + + // the number of frames in each sequence of appended objects. num_frames * + // num_sequences must equal the path length of any path in the FST. + // Technically this information is redundant with the FST, but it's convenient + // to have it separately. + int32 frames_per_sequence; + + // the maximum possible value of the labels in 'fst' (which go from 1 to + // label_dim). This should equal the NumPdfs() in the TransitionModel object. + // Included to avoid training on mismatched egs. + int32 label_dim; + + // This is an epsilon-free unweighted acceptor that is sorted in increasing + // order of frame index (this implies it's topologically sorted but it's a + // stronger condition). The labels are pdf-ids plus one (to avoid epsilons, + // since pdf-ids are zero-based). 
Each successful path in 'fst' has exactly + // 'frames_per_sequence * num_sequences' arcs on it (first 'frames_per_sequence' arcs for the + // first sequence; then 'frames_per_sequence' arcs for the second sequence, and so on). + fst::StdVectorFst fst; + + Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), + label_dim(-1) { } + + Supervision(const Supervision &other); + + void Swap(Supervision *other); + + bool operator == (const Supervision &other) const; + + // This function checks that this supervision object satifsies some + // of the properties we expect of it, and calls KALDI_ERR if not. + void Check(const TransitionModel &trans_model) const; + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); +}; + + +/** This function creates a Supervision object from a ProtoSupervision object. + The labels will be pdf-ids plus one. It sets supervision->label_dim + trans_model.NumPdfs(). + + It returns true on success, and false on failure; the only failure mode is + that it might return false on that would not be a bug, is when the FST is + empty because there were too many phones for the number of frames. +*/ +bool ProtoSupervisionToSupervision( + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const ProtoSupervision &proto_supervision, + Supervision *supervision); + + +/** + This function sorts the states of the fst argument in an ordering + corresponding with a breadth-first search order starting from the + start state. This gives us the sorting on frame index for the + FSTs that appear in class Supervision (it relies on them being + epsilon-free). + This function requires that the input FST be connected (i.e. all states + reachable from the start state). + This function is called from ProtoSupervisionToSupervision(). +*/ +void SortBreadthFirstSearch(fst::StdVectorFst *fst); + +// This class is used for splitting something of type Supervision into +// multiple pieces corresponding to different frame-ranges. +class SupervisionSplitter { + public: + SupervisionSplitter(const Supervision &supervision); + + // Extracts a frame range of the supervision into 'supervision'. Note: the + // supervision object should not be used for training before you do + // 'AddWeightToSupervisionFst', which not only adds the weights from the + // normalization graph (derived from the normalization FST), but also removes + // epsilons and ensures the states are sorted on time. + void GetFrameRange(int32 begin_frame, int32 frames_per_sequence, + Supervision *supervision) const; + private: + // Creates an output FST covering frames begin_frame <= t < end_frame, + // assuming that the corresponding state-range that we need to + // include, begin_state <= s < end_state has been included. + // (note: the output FST will also have two special initial and final + // states). Does not do the post-processing (RmEpsilon, Determinize, + // TopSort on the result). See code for details. + void CreateRangeFst(int32 begin_frame, int32 end_frame, + int32 begin_state, int32 end_state, + fst::StdVectorFst *fst) const; + + const Supervision &supervision_; + // Indexed by the state-index of 'supervision_.fst', this is the frame-index, + // which ranges from 0 to (supervision_.frames_per_sequence * + // supervision_.num_sequences) - 1. This will be monotonically increasing + // (note that supervision_.fst is topologically sorted). 
+ std::vector frame_; +}; + + +/// This function adds weights to the FST in the supervision object, by +/// composing with the 'normalization fst'. It should be called directly after +/// GetFrameRange(). The 'normalization fst' is produced by the function +/// DenominatorGraph::GetNormalizationFst(); it's a slight modification of the +/// 'denominator fst'. This function modifies the weights in the supervision +/// object- adding to each path, the weight that it gets in the normalization +/// fst, which is the same weight that it will get in the denominator +/// forward-backward computation. This ensures that the (log) objective +/// function can never be positive (as the numerator graph will be a strict +/// subset of the denominator, with the same weights for the same paths). This +/// function returns true on success, and false on the (hopefully) rare occasion +/// that the composition of the normalization fst with the supervision produced +/// an empty result (this shouldn't happen unless there were alignment errors in +/// the alignments used to train the phone language model leading to unseen +/// 3-grams that occur in the training sequences). +/// This function also removes epsilons and makes sure supervision->fst has the +/// required sorting of states. Think of it as the final stage in preparation +/// of the supervision FST. +bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, + Supervision *supervision); + +/// Assuming the 'fst' is epsilon-free, connected, and has the property that all +/// paths from the start-state are of the same length, output a vector +/// containing that length (from the start-state to the current state) to +/// 'state_times'. The member 'fst' of struct Supervision has this property. +/// Returns the total number of frames. This function is similar to +/// LatticeStateTimes() and CompactLatticeStateTimes() declared in +/// lat/lattice-functions.h, except that unlike LatticeStateTimes(), we don't +/// allow epsilons-- not because they are hard to handle but because in this +/// context we don't expect them. This function also expects that the input fst +/// will have the property that the state times are in nondecreasing order (as +/// SortBreadthFirstSearch() will accomplish for FSTs satsifying the other +/// properties we mentioned). This just happens to be something we enforce +/// while creating these FSTs. +/// +/// @param fst[in] The input fst: should be epsilon-free; connected; nonempty; +/// should have the property that all paths to a given state (or +/// to a nonzero final-prob) should have the same number of arcs; +/// and its states should be sorted on this path length (e.g. +/// SortBreadthFirst will do this). +/// @param state_times[out] The state times that we output; will be set to +/// a vector with the dimension fst.NumStates(). +/// +/// @return Returns the path length +int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, + std::vector *state_times); + + +/// This function appends a list of supervision objects to create what will +/// usually be a single such object, but if the weights and num-frames are not +/// all the same it will only append Supervision objects where successive ones +/// have the same weight and num-frames, and if 'compactify' is true. The +/// normal use-case for this is when you are combining neural-net examples for +/// training; appending them like this helps to simplify the training process. 
+ +/// This function will crash if the values of label_dim in the inputs are not +/// all the same. +void AppendSupervision(const std::vector &input, + bool compactify, + std::vector *output_supervision); + + +/// This function helps you to pseudo-randomly split a sequence of length 'num_frames', +/// interpreted as frames 0 ... num_frames - 1, into pieces of length exactly +/// 'frames_per_range', to be used as examples for training. Because frames_per_range +/// may not exactly divide 'num_frames', this function will leave either small gaps or +/// small overlaps in pseudo-random places. +/// The output 'range_starts' will be set to a list of the starts of ranges, the +/// output ranges are of the form +/// [ (*range_starts)[i] ... (*range_starts)[i] + frames_per_range - 1 ]. +void SplitIntoRanges(int32 num_frames, + int32 frames_per_range, + std::vector *range_starts); + + +/// This utility function is not used directly in the 'chain' code. It is used +/// to get weights for the derivatives, so that we don't doubly train on some +/// frames after splitting them up into overlapping ranges of frames. The input +/// 'range_starts' will be obtained from 'SplitIntoRanges', but the +/// 'range_length', which is a length in frames, may be longer than the one +/// supplied to SplitIntoRanges, due the 'overlap'. (see the calling code... +/// if we want overlapping ranges, we get it by 'faking' the input to +/// SplitIntoRanges). +/// +/// The output vector 'weights' will be given the same dimension as +/// 'range_starts'. By default the output weights in '*weights' will be vectors +/// of all ones, of length equal to 'range_length', and '(*weights)[i]' represents +/// the weights given to frames numbered +/// t = range_starts[i] ... range_starts[i] + range_length - 1. +/// If these ranges for two successive 'i' values overlap, then we +/// reduce the weights to ensure that no 't' value gets a total weight +/// greater than 1. We do this by dividing the overlapped region +/// into three approximately equal parts, and giving the left part +/// to the left range; the right part to the right range; and +/// in between, interpolating linearly. +void GetWeightsForRanges(int32 range_length, + const std::vector &range_starts, + std::vector > *weights); + + +/// This is a newer version of GetWeightsForRanges with a simpler behavior +/// than GetWeightsForRanges and a different purpose. Instead of aiming to +/// create weights that sum to one over the whole file, the purpose is to +/// zero out the derivative weights for a certain number of frames to each +/// side of every 'cut point' in the numerator lattice [by numerator lattice, +/// what I mean is the FST that we automatically generate from the numerator +/// alignment or lattice]. So we don't zero out the weights for the very +/// beginning or very end of each original utterance, just those where +/// we split the utterance into pieces. We believe there is an incentive +/// for the network to produce deletions near the edges, and this aims to fix +/// this problem. +/// range_length is the length of each range of times (so range_starts[0] +/// represents the start of a range of t values of length 'range_length' +/// and so range_starts[1] etc.), and num_frames_zeroed is the number of frames +/// on each side of the cut point on which we are supposed to zero out the +/// derivative. 
+void GetWeightsForRangesNew(int32 range_length, + int32 num_frames_zeroed, + const std::vector &range_starts, + std::vector > *weights); + + +typedef TableWriter > SupervisionWriter; +typedef SequentialTableReader > SequentialSupervisionReader; +typedef RandomAccessTableReader > RandomAccessSupervisionReader; + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_SUPERVISION_H_ diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc new file mode 100644 index 00000000000..1bf0201fbfa --- /dev/null +++ b/src/chain/chain-training.cc @@ -0,0 +1,115 @@ +// chain/chain-training.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/chain-training.h" +#include "chain/chain-kernels-ansi.h" +#include "chain/chain-numerator.h" +#include "chain/chain-denominator.h" + +namespace kaldi { +namespace chain { + +void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv) { + BaseFloat num_logprob_weighted; + if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + { + NumeratorComputation numerator(supervision, nnet_output); + // note: supervision.weight is included as a factor in the derivative from + // the numerator object, and the logprob too. + num_logprob_weighted = numerator.Forward(); + if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); + if (xent_output_deriv) + xent_output_deriv->CopyFromMat(*nnet_output_deriv); + } else if (xent_output_deriv) { + // this branch will be taken if xent_output_deriv but not + // nnet_output_deriv is set- which could happen if you want to compute the + // cross-entropy objective but not the derivatives. + xent_output_deriv->SetZero(); + numerator.Backward(xent_output_deriv); + } + } + DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + BaseFloat den_logprob = denominator.Forward(); + bool ok = true; + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + + *objf = num_logprob_weighted - supervision.weight * den_logprob; + *weight = supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence; + if (!((*objf) - (*objf) == 0) || !ok) { + // inf or NaN detected, or denominator computation returned false. 
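+    // (The check '(*objf) - (*objf) == 0' is false exactly when *objf is
+    // +/-inf or NaN: for any finite value the difference is 0.0, while
+    // inf - inf and NaN - NaN are both NaN, which compares unequal to 0.)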
+ if (nnet_output_deriv) + nnet_output_deriv->SetZero(); + if (xent_output_deriv) + xent_output_deriv->SetZero(); + BaseFloat default_objf = -10; + KALDI_WARN << "Objective function is " << (*objf) + << " and denominator computation (if done) returned " + << std::boolalpha << ok + << ", setting objective function to " << default_objf + << " per frame."; + *objf = default_objf * *weight; + } + + // This code helps us see how big the derivatives are, on average, + // for different frames of the sequences. As expected, they are + // smaller towards the edges of the sequences (due to the penalization + // of 'incorrect' pdf-ids. + if (GetVerboseLevel() >= 1) { + int32 tot_frames = nnet_output_deriv->NumRows(), + frames_per_sequence = supervision.frames_per_sequence, + num_sequences = supervision.num_sequences; + CuVector row_products(tot_frames); + row_products.AddDiagMat2(1.0, *nnet_output_deriv, kNoTrans, 0.0); + Vector row_products_cpu(row_products); + Vector row_products_per_frame(frames_per_sequence); + for (int32 i = 0; i < tot_frames; i++) + row_products_per_frame(i / num_sequences) += row_products_cpu(i); + KALDI_LOG << "Derivs per frame are " << row_products_per_frame; + } + + if (opts.l2_regularize == 0.0) { + *l2_term = 0.0; + } else { + // compute the l2 penalty term and its derivative + BaseFloat scale = supervision.weight * opts.l2_regularize; + *l2_term = -0.5 * scale * TraceMatMat(nnet_output, nnet_output, kTrans); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(-1.0 * scale, nnet_output); + } +} + + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h new file mode 100644 index 00000000000..e6143d10846 --- /dev/null +++ b/src/chain/chain-training.h @@ -0,0 +1,131 @@ +// chain/chain-training.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_CHAIN_TRAINING_H_ +#define KALDI_CHAIN_CHAIN_TRAINING_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "tree/context-dep.h" +#include "lat/kaldi-lattice.h" +#include "matrix/kaldi-matrix.h" +#include "hmm/transition-model.h" +#include "chain/chain-den-graph.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + + +struct ChainTrainingOptions { + // l2 regularization constant on the 'chain' output; the actual term added to + // the objf will be -0.5 times this constant times the squared l2 norm. + // (squared so it's additive across the dimensions). e.g. try 0.0005. + BaseFloat l2_regularize; + + // Coefficient for 'leaky hmm'. 
This means we have an epsilon-transition from + // each state to a special state with probability one, and then another + // epsilon-transition from that special state to each state, with probability + // leaky_hmm_coefficient times [initial-prob of destination state]. Imagine + // we make two copies of each state prior to doing this, version A and version + // B, with transition from A to B, so we don't have to consider epsilon loops- + // or just imagine the coefficient is small enough that we can ignore the + // epsilon loops. + BaseFloat leaky_hmm_coefficient; + + + // Cross-entropy regularization constant. (e.g. try 0.1). If nonzero, + // the network is expected to have an output named 'output-xent', which + // should have a softmax as its final nonlinearity. + BaseFloat xent_regularize; + + ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), + xent_regularize(0.0) { } + + void Register(OptionsItf *opts) { + opts->Register("l2-regularize", &l2_regularize, "l2 regularization " + "constant for 'chain' training, applied to the output " + "of the neural net."); + opts->Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "Coefficient " + "that allows transitions from each HMM state to each other " + "HMM state, to ensure gradual forgetting of context (can " + "improve generalization). For numerical reasons, may not be " + "exactly zero."); + opts->Register("xent-regularize", &xent_regularize, "Cross-entropy " + "regularization constant for 'chain' training. If " + "nonzero, the network is expected to have an output " + "named 'output-xent', which should have a softmax as " + "its final nonlinearity."); + } +}; + + +/** + This function does both the numerator and denominator parts of the 'chain' + computation in one call. + + @param [in] opts Struct containing options + @param [in] den_graph The denominator graph, derived from denominator fst. + @param [in] supervision The supervision object, containing the supervision + paths and constraints on the alignment as an FST + @param [in] nnet_output The output of the neural net; dimension must equal + ((supervision.num_sequences * supervision.frames_per_sequence) by + den_graph.NumPdfs()). The rows are ordered as: all sequences + for frame 0; all sequences for frame 1; etc. + @param [out] objf The [num - den] objective function computed for this + example; you'll want to divide it by 'tot_weight' before + displaying it. + @param [out] l2_term The l2 regularization term in the objective function, if + the --l2-regularize option is used. To be added to 'o + @param [out] weight The weight to normalize the objective function by; + equals supervision.weight * supervision.num_sequences * + supervision.frames_per_sequence. + @param [out] nnet_output_deriv The derivative of the objective function w.r.t. + the neural-net output. Only written to if non-NULL. + You don't have to zero this before passing to this function, + we zero it internally. + @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative + (which equals a posterior from the numerator forward-backward, + scaled by the supervision weight) is written to here. This will + be used in the cross-entropy regularization code. This value + is also used in computing the cross-entropy objective value. 
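+
+      A typical call from training code might look like the following sketch
+      (the surrounding variable names are illustrative only, and the deriv
+      matrices are assumed to already have the right dimensions):
+
+        BaseFloat objf, l2_term, weight;
+        ComputeChainObjfAndDeriv(opts, den_graph, supervision, nnet_output,
+                                 &objf, &l2_term, &weight,
+                                 &nnet_output_deriv, NULL);
+        BaseFloat objf_per_frame = (objf + l2_term) / weight;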
+*/ +void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, + const DenominatorGraph &den_graph, + const Supervision &supervision, + const CuMatrixBase &nnet_output, + BaseFloat *objf, + BaseFloat *l2_term, + BaseFloat *weight, + CuMatrixBase *nnet_output_deriv, + CuMatrixBase *xent_output_deriv = NULL); + + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CHAIN_TRAINING_H_ + diff --git a/src/chain/context-dep-topology.h b/src/chain/context-dep-topology.h new file mode 100644 index 00000000000..5eae267a5cf --- /dev/null +++ b/src/chain/context-dep-topology.h @@ -0,0 +1,129 @@ +// chain/context-dep-topology.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ +#define KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "chain/phone-topology.h" +#include "chain/phone-context.h" + +namespace kaldi { +namespace chain { + + +/** + The 'ContextDepTopology' object is responsible for combining the + 'PhoneTopology' model, which describes the quasi-HMM topology for each phone, + and the 'PhoneContext' model, which describes how we create left-context + dependent phones. It also allocates 'graph-labels' and 'output-labels'. It + is analogous to 'HC' in the 'HCLG' recipe. It's of a manageable size as an + FST, because we limit ourselves to left context. + + A 'graph-label' is one-based, is sufficient to identify the logical CD-phone + and the label in the topology, and can also be mapped to an 'output-label'. + + The output-label is also one-based; it is sufficient to identify the physical + CD-phone and the label in the topology object, but won't let you identify + the monophone (because output-labels may be shared between monophones). + + The neural-net output is indexed by the output-label minus one (to form + a zero-based index). +*/ + +class ContextDepTopology { + public: + + ContextDepTopology(); + + ContextDepTopology(const PhoneTopology &topology, + const PhoneContext &context); + + const PhoneTopology &GetPhoneTopology() { return phone_topology_; } + + const PhoneContext &GetPhoneContext() { return phone_context_; } + + // Returns the number of output-labels (labels corresponding to the neural-net + // output). The actual neural-net output matrix is indexed by the label minus + // one, which we call an output-index. + int32 NumOutputLabels(); + + // Returns the number of graph-labels. A graph-label is what will typically + // appear in HCLG decoding graphs; it is mappable to an output-label, but we + // also ensure that it is mappable to a phone. + int32 NumGraphLabels(); + + // convenience function to return the number of phones. 
+ int32 NumPhones() { return phone_topology_.NumPhones(); } + + // maps a graph-label to an output-label. + int32 GraphLabelToOutputLabel(int32 graph_label); + + // maps a graph label to a phone. + int32 GraphLabelToPhone(int32 graph_label); + + // maps a graph label to a logical cd-phone [a logical cd-phone is always + // mappable to the monophone]. + int32 GraphLabelToLogicalCdPhone(int32 graph_label); + + // maps a graph label to a physical cd-phone, as defined by the PhoneContext + // object. + int32 GraphLabelToPhysicalCdPhone(int32 graph_label); + + // maps a graph label to a label in the phone's topology object (needed to + // work out phone alignments). + int32 GraphLabelToTopologyLabel(int32 graph_label); + + // Outputs to 'output' an FST that represents this object-- it's essentially + // the 'HC' object in the 'HCLG' recipe. It's an unweighted transducer where + // the input labels are phones (or epsilon) and the output labels are + // 'graph-labels'. Note: we will ensure that there are no epsilons on + // the 'output side'. + void GetAsFst(fst::VectorFst* output) const; + + // This variant of of GetAsFst gives you 'output-labels' as the olabels, instead + // of graph-labels. These are indexes-into-the-nnet-output plus one. + void GetAsFstWithOutputLabels(fst::VectorFst* output) const; + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is, bool binary); + + private: + PhoneTopology phone_topology_; + PhoneContext phone_context_; + + struct GraphLabelInfo { + int32 logical_cd_phone; + int32 topology_label; + int32 output_label; + }; +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ diff --git a/src/chain/language-model-test.cc b/src/chain/language-model-test.cc new file mode 100644 index 00000000000..04a57441ada --- /dev/null +++ b/src/chain/language-model-test.cc @@ -0,0 +1,112 @@ +// chain/language-model-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/language-model.h" +#include "fstext/fstext-utils.h" + +namespace kaldi { +namespace chain { + +static void GetTestingData(int32 *vocab_size, + std::vector > *data) { + // read the code of a C++ file as training data. + bool binary; + Input input("language-model.cc", &binary); + KALDI_ASSERT(!binary); + std::istream &is = input.Stream(); + std::string line; + *vocab_size = 127; + int32 line_count = 0; + for (; getline(is, line); line_count++) { + std::vector int_line(line.size()); + for (size_t i = 0; i < line.size(); i++) { + int32 this_char = line[i]; + if (this_char == 0) { + this_char = 1; // should never happen, but just make sure, as 0 is + // treated as BOS/EOS in the language modeling code. 
+ } + int_line[i] = std::min(127, this_char); + } + data->push_back(int_line); + } + KALDI_ASSERT(line_count > 0); +} + + +void ShowPerplexity(const fst::StdVectorFst &fst, + const std::vector > &data) { + int64 num_phones = 0; + double tot_loglike = 0; + for (size_t i = 0; i < data.size(); i++) { + num_phones += data[i].size(); + fst::StdVectorFst linear_fst; + MakeLinearAcceptor(data[i], &linear_fst); + fst::StdVectorFst composed_fst; + fst::Compose(linear_fst, fst, &composed_fst); + fst::TropicalWeight weight = fst::ShortestDistance(composed_fst); + KALDI_ASSERT(weight != fst::TropicalWeight::Zero()); + tot_loglike -= weight.Value(); + } + double perplexity = exp(-(tot_loglike / num_phones)); + KALDI_LOG << "Perplexity over " << num_phones + << " phones (of training data) is " << perplexity; +} + + +void LanguageModelTest() { + int32 vocab_size; + std::vector > data; + GetTestingData(&vocab_size, &data); + + LanguageModelOptions opts; + opts.no_prune_ngram_order = RandInt(1, 3); + opts.ngram_order = opts.no_prune_ngram_order + RandInt(0, 3); + opts.num_extra_lm_states = RandInt(1, 200); + if (opts.ngram_order < 2) + opts.ngram_order = 2; + if (RandInt(1, 2) == 1) + opts.num_extra_lm_states *= 10; + + LanguageModelEstimator estimator(opts); + for (size_t i = 0; i < data.size(); i++) { + std::vector &sentence = data[i]; + estimator.AddCounts(sentence); + } + + fst::StdVectorFst fst; + estimator.Estimate(&fst); + bool ans = IsStochasticFstInLog(fst); + KALDI_ASSERT(ans); // check that it normalizes. + KALDI_ASSERT(fst.Properties(fst::kAcceptor, true) == fst::kAcceptor); + KALDI_ASSERT(fst.Properties(fst::kIDeterministic, true) == fst::kIDeterministic); + KALDI_ASSERT(fst.Properties(fst::kIEpsilons, true) == 0); + + ShowPerplexity(fst, data); +} + + + +} // namespace chain +} // namespace kaldi + +int main() { + // kaldi::SetVerboseLevel(2); + for (int32 i = 0; i < 30; i++) + kaldi::chain::LanguageModelTest(); +} diff --git a/src/chain/language-model.cc b/src/chain/language-model.cc new file mode 100644 index 00000000000..f144d3d1bc1 --- /dev/null +++ b/src/chain/language-model.cc @@ -0,0 +1,411 @@ +// chain/language-model.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "chain/language-model.h" +#include "util/simple-io-funcs.h" + + +namespace kaldi { +namespace chain { + +void LanguageModelEstimator::AddCounts(const std::vector &sentence) { + KALDI_ASSERT(opts_.ngram_order >= 2 && "--ngram-order must be >= 2"); + KALDI_ASSERT(opts_.ngram_order >= opts_.no_prune_ngram_order); + int32 order = opts_.ngram_order; + // 0 is used for left-context at the beginning of the file.. treat it as BOS. 
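+  // For example, with ngram_order = 3 and a sentence { a, b, c }, the loop
+  // below increments the counts (0) -> a, (0, a) -> b, (a, b) -> c and
+  // finally (b, c) -> 0 for the end of sentence; the history window never
+  // grows beyond ngram_order - 1 = 2 symbols.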
+ std::vector history(1, 0); + std::vector::const_iterator iter = sentence.begin(), + end = sentence.end(); + for (; iter != end; ++iter) { + KALDI_ASSERT(*iter != 0); + IncrementCount(history, *iter); + history.push_back(*iter); + if (history.size() >= order) + history.erase(history.begin()); + } + // Probability of end of sentence. This will end up getting ignored later, but + // it still makes a difference for probability-normalization reasons. + IncrementCount(history, 0); +} + +void LanguageModelEstimator::IncrementCount(const std::vector &history, + int32 next_phone) { + int32 lm_state_index = FindOrCreateLmStateIndexForHistory(history); + if (lm_states_[lm_state_index].tot_count == 0) { + num_active_lm_states_++; + } + lm_states_[lm_state_index].AddCount(next_phone, 1); +} + +void LanguageModelEstimator::SetParentCounts() { + int32 num_lm_states = lm_states_.size(); + for (int32 l = 0; l < num_lm_states; l++) { + int32 this_count = lm_states_[l].tot_count; + int32 l_iter = l; + while (l_iter != -1) { + lm_states_[l_iter].tot_count_with_parents += this_count; + l_iter = lm_states_[l_iter].backoff_lmstate_index; + } + } + for (int32 l = 0; l < num_lm_states; l++) { + KALDI_ASSERT(lm_states_[l].tot_count_with_parents >= + lm_states_[l].tot_count); + } +} + +int32 LanguageModelEstimator::CheckActiveStates() const { + int32 num_active_states = 0, + num_lm_states = lm_states_.size(), + num_basic_lm_states = 0; + for (int32 l = 0; l < num_lm_states; l++) { + if (lm_states_[l].tot_count != 0) + num_active_states++; + if (lm_states_[l].history.size() == opts_.no_prune_ngram_order - 1) + num_basic_lm_states++; + } + KALDI_ASSERT(num_active_states == num_active_lm_states_); + return num_basic_lm_states; +} + +int32 LanguageModelEstimator::FindLmStateIndexForHistory( + const std::vector &hist) const { + MapType::const_iterator iter = hist_to_lmstate_index_.find(hist); + if (iter == hist_to_lmstate_index_.end()) + return -1; + else + return iter->second; +} + +int32 LanguageModelEstimator::FindNonzeroLmStateIndexForHistory( + std::vector hist) const { + while (1) { + int32 l = FindLmStateIndexForHistory(hist); + if (l == -1 || lm_states_[l].tot_count == 0) { + // no such state or state has zero count. + if (hist.empty()) + KALDI_ERR << "Error looking up LM state index for history " + << "(likely code bug)"; + hist.erase(hist.begin()); // back off. + } else { + return l; + } + } +} + +int32 LanguageModelEstimator::FindOrCreateLmStateIndexForHistory( + const std::vector &hist) { + MapType::const_iterator iter = hist_to_lmstate_index_.find(hist); + if (iter != hist_to_lmstate_index_.end()) + return iter->second; + int32 ans = lm_states_.size(); // index of next element + // next statement relies on default construct of LmState. + lm_states_.resize(lm_states_.size() + 1); + lm_states_.back().history = hist; + hist_to_lmstate_index_[hist] = ans; + // make sure backoff_lmstate_index is set, if needed. + if (hist.size() >= opts_.no_prune_ngram_order) { + // we need a backoff state to exist- create one if needed. 
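+    // For example, if no_prune_ngram_order is 2 and hist is (a, b, c), the
+    // recursion below creates (or finds) LM states for (b, c) and then (c),
+    // chaining them via backoff_lmstate_index; it stops once the history is
+    // shorter than no_prune_ngram_order.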
+ std::vector backoff_hist(hist.begin() + 1, + hist.end()); + + int32 backoff_lm_state = FindOrCreateLmStateIndexForHistory( + backoff_hist); + lm_states_[ans].backoff_lmstate_index = backoff_lm_state; + hist_to_lmstate_index_[backoff_hist] = backoff_lm_state; + } + return ans; +} + +void LanguageModelEstimator::LmState::AddCount(int32 phone, int32 count) { + std::map::iterator iter = phone_to_count.find(phone); + if (iter == phone_to_count.end()) + phone_to_count[phone] = count; + else + iter->second += count; + tot_count += count; +} + +void LanguageModelEstimator::LmState::Add(const LmState &other) { + KALDI_ASSERT(&other != this); + std::map::const_iterator iter = other.phone_to_count.begin(), + end = other.phone_to_count.end(); + for (; iter != end; ++iter) + AddCount(iter->first, iter->second); +} + +void LanguageModelEstimator::LmState::Clear() { + phone_to_count.clear(); + tot_count = 0; + tot_count_with_parents = false; + backoff_allowed = false; +} + +BaseFloat LanguageModelEstimator::LmState::LogLike() const { + double ans = 0.0; + int32 tot_count_check = 0; + std::map::const_iterator iter = phone_to_count.begin(), + end = phone_to_count.end(); + for (; iter != end; ++iter) { + int32 count = iter->second; + tot_count_check += count; + double prob = count * 1.0 / tot_count; + ans += log(prob) * count; + } + KALDI_ASSERT(tot_count_check == tot_count); + return ans; +} + +void LanguageModelEstimator::InitializeQueue() { + int32 num_lm_states = lm_states_.size(); + while (!queue_.empty()) queue_.pop(); + for (int32 l = 0; l < num_lm_states; l++) { + lm_states_[l].backoff_allowed = BackoffAllowed(l); + if (lm_states_[l].backoff_allowed) { + BaseFloat like_change = BackoffLogLikelihoodChange(l); + queue_.push(std::pair(like_change, l)); + } + } +} + +BaseFloat LanguageModelEstimator::BackoffLogLikelihoodChange( + int32 l) const { + const LmState &lm_state = lm_states_.at(l); + KALDI_ASSERT(lm_state.backoff_allowed && lm_state.backoff_lmstate_index >= 0); + const LmState &backoff_lm_state = lm_states_.at( + lm_state.backoff_lmstate_index); + KALDI_ASSERT(lm_state.tot_count != 0); + // if the backoff state has zero count, there would naturally be a zero + // cost, but return -1e15 * (count of this lm state)... this encourages the + // lowest-count state to be backed off first. + if (backoff_lm_state.tot_count == 0) + return -1.0e-15 * lm_state.tot_count; + LmState sum_state(backoff_lm_state); + sum_state.Add(lm_state); + BaseFloat log_like_change = + sum_state.LogLike() - + lm_state.LogLike() - + backoff_lm_state.LogLike(); + // log-like change should not be positive... give it a margin for round-off + // error. + KALDI_ASSERT(log_like_change < 0.1); + if (log_like_change > 0.0) + log_like_change = 0.0; + return log_like_change; +} + + +void LanguageModelEstimator::DoBackoff() { + int32 initial_active_states = num_active_lm_states_, + target_num_lm_states = num_basic_lm_states_ + opts_.num_extra_lm_states; + + // create 3 intermediate targets and the final target. Between each phase we'll + // do InitializeQueue(), which will get us more exact values. 
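+  // For example (illustrative numbers): with 10000 initially active states and
+  // a final target of 4000, the schedule below gives intermediate targets of
+  // 8500, 7000 and 5500 before reaching 4000.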
+ int32 num_targets = 4; + std::vector targets(num_targets); + for (int32 t = 0; t < num_targets; t++) { + // the targets get progressively closer to target_num_lm_states; + targets[t] = initial_active_states + + ((target_num_lm_states - initial_active_states) * (t + 1)) / num_targets; + } + KALDI_ASSERT(targets.back() == target_num_lm_states); + + for (int32 t = 0; t < num_targets; t++) { + KALDI_VLOG(2) << "Backing off states, stage " << t; + InitializeQueue(); + int32 this_target = targets[t]; + while (num_active_lm_states_ > this_target && !queue_.empty()) { + BaseFloat like_change = queue_.top().first; + int32 lm_state = queue_.top().second; + queue_.pop(); + BaseFloat recomputed_like_change = BackoffLogLikelihoodChange(lm_state); + if (!ApproxEqual(like_change, recomputed_like_change)) { + // If it changed (i.e. we had a stale likelihood-change on the queue), + // just put back the recomputed like-change on the queue and make no other + // changes. + KALDI_VLOG(2) << "Not backing off state, since like-change changed from " + << like_change << " to " << recomputed_like_change; + queue_.push(std::pair(recomputed_like_change, lm_state)); + } else { + KALDI_VLOG(2) << "Backing off state with like-change = " + << recomputed_like_change; + BackOffState(lm_state); + } + } + } + KALDI_LOG << "In LM [hard] backoff, target num states was " + << num_basic_lm_states_ << " + --num-extra-lm-states=" + << opts_.num_extra_lm_states << " = " << target_num_lm_states + << ", pruned from " << initial_active_states << " to " + << num_active_lm_states_; +} + +void LanguageModelEstimator::BackOffState(int32 l) { + LmState &lm_state = lm_states_.at(l); + KALDI_ASSERT(lm_state.backoff_allowed); + KALDI_ASSERT(lm_state.backoff_lmstate_index >= 0); + KALDI_ASSERT(lm_state.tot_count > 0); // or shouldn't be backing it off. + LmState &backoff_lm_state = lm_states_.at(lm_state.backoff_lmstate_index); + bool backoff_state_had_backoff_allowed = backoff_lm_state.backoff_allowed; + if (backoff_lm_state.tot_count != 0) + num_active_lm_states_--; + // add the counts of lm_state to backoff_lm_state. + backoff_lm_state.Add(lm_state); + // zero the counts in this lm_state. + lm_state.Clear(); + backoff_lm_state.backoff_allowed = BackoffAllowed( + lm_state.backoff_lmstate_index); + + if (!backoff_state_had_backoff_allowed && + backoff_lm_state.backoff_allowed) { + // the backoff state would not have been in the queue, but is now allowed in + // the queue. 
+ BaseFloat backoff_like_change = BackoffLogLikelihoodChange( + lm_state.backoff_lmstate_index); + queue_.push(std::pair(backoff_like_change, + lm_state.backoff_lmstate_index)); + } +} + +int32 LanguageModelEstimator::AssignFstStates() { + CheckActiveStates(); + int32 num_lm_states = lm_states_.size(); + int32 current_fst_state = 0; + for (int32 l = 0; l < num_lm_states; l++) + if (lm_states_[l].tot_count != 0) + lm_states_[l].fst_state = current_fst_state++; + KALDI_ASSERT(current_fst_state == num_active_lm_states_); + return current_fst_state; +} + +void LanguageModelEstimator::Estimate(fst::StdVectorFst *fst) { + KALDI_LOG << "Estimating language model with --no-prune-ngram-order=" + << opts_.no_prune_ngram_order << ", --ngram-order=" + << opts_.ngram_order << ", --num-extra-lm-state=" + << opts_.num_extra_lm_states; + SetParentCounts(); + num_basic_lm_states_ = CheckActiveStates(); + DoBackoff(); + int32 num_fst_states = AssignFstStates(); + OutputToFst(num_fst_states, fst); +} + +int32 LanguageModelEstimator::FindInitialFstState() const { + std::vector history(1, 0); + int32 l = FindNonzeroLmStateIndexForHistory(history); + KALDI_ASSERT(l != -1 && lm_states_[l].fst_state != -1); + return lm_states_[l].fst_state; +} + + +bool LanguageModelEstimator::BackoffAllowed(int32 l) const { + const LmState &lm_state = lm_states_.at(l); + if (lm_state.history.size() < opts_.no_prune_ngram_order) + return false; + KALDI_ASSERT(lm_state.tot_count <= lm_state.tot_count_with_parents); + if (lm_state.tot_count != lm_state.tot_count_with_parents) + return false; + if (lm_state.tot_count == 0) + return false; + // the next if-statement is an optimization where we skip the + // following test if we know that it must always be true. + if (lm_state.history.size() == opts_.ngram_order - 1) + return true; + std::map::const_iterator + iter = lm_state.phone_to_count.begin(), + end = lm_state.phone_to_count.end(); + for (; iter != end; ++iter) { + int32 phone = iter->first; + if (phone != 0) { + std::vector next_hist(lm_state.history); + next_hist.push_back(phone); + int32 next_lmstate = FindLmStateIndexForHistory(next_hist); + if (next_lmstate != -1 && + lm_states_[next_lmstate].tot_count_with_parents != 0) { + // backoff is not allowed because we need all the context we have + // in order to make this transition; we can't afford to discard + // the leftmost phone. + return false; + } + } + } + return true; +} + +void LanguageModelEstimator::OutputToFst( + int32 num_states, + fst::StdVectorFst *fst) const { + KALDI_ASSERT(num_states == num_active_lm_states_); + fst->DeleteStates(); + for (int32 i = 0; i < num_states; i++) + fst->AddState(); + fst->SetStart(FindInitialFstState()); + + int64 tot_count = 0; + double tot_logprob = 0.0; + + int32 num_lm_states = lm_states_.size(); + // note: not all lm-states end up being 'active'. + for (int32 l = 0; l < num_lm_states; l++) { + const LmState &lm_state = lm_states_[l]; + if (lm_state.fst_state == -1) + continue; + int32 state_count = lm_state.tot_count; + KALDI_ASSERT(state_count != 0); + std::map::const_iterator + iter = lm_state.phone_to_count.begin(), + end = lm_state.phone_to_count.end(); + for (; iter != end; ++iter) { + int32 phone = iter->first, count = iter->second; + BaseFloat logprob = log(count * 1.0 / state_count); + tot_count += count; + tot_logprob += logprob * count; + if (phone == 0) { // Interpret as final-prob. + fst->SetFinal(lm_state.fst_state, fst::TropicalWeight(-logprob)); + } else { // It becomes a transition. 
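+        // The arc is an acceptor arc: ilabel == olabel == phone, the weight is
+        // -log(count / state_count), and the destination is the FST state of
+        // the most specific surviving history for (this history + phone).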
+ std::vector next_history(lm_state.history); + next_history.push_back(phone); + int32 dest_lm_state = FindNonzeroLmStateIndexForHistory(next_history), + dest_fst_state = lm_states_[dest_lm_state].fst_state; + KALDI_ASSERT(dest_fst_state != -1); + fst->AddArc(lm_state.fst_state, + fst::StdArc(phone, phone, fst::TropicalWeight(-logprob), + dest_fst_state)); + } + } + } + BaseFloat perplexity = exp(-(tot_logprob / tot_count)); + KALDI_LOG << "Total number of phone instances seen was " << tot_count; + KALDI_LOG << "Perplexity on training data is: " << perplexity; + KALDI_LOG << "Note: perplexity on unseen data will be infinity as there is " + << "no smoothing. This is by design, to reduce the number of arcs."; + fst::Connect(fst); + // Make sure that Connect does not delete any states. + int32 num_states_connected = fst->NumStates(); + KALDI_ASSERT(num_states_connected == num_states); + // arc-sort. ilabel or olabel doesn't matter, it's an acceptor. + fst::ArcSort(fst, fst::ILabelCompare()); + KALDI_LOG << "Created phone language model with " << num_states << " states."; +} + +} // namespace chain +} // namespace kaldi + + diff --git a/src/chain/language-model.h b/src/chain/language-model.h new file mode 100644 index 00000000000..b2c3f4cd746 --- /dev/null +++ b/src/chain/language-model.h @@ -0,0 +1,269 @@ +// chain/language-model.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_LANGUAGE_MODEL_H_ +#define KALDI_CHAIN_LANGUAGE_MODEL_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" + +namespace kaldi { + + +namespace chain { + +// Options for phone language model estimation. This is similar to an +// un-smoothed language model of a certain order (e.g. triphone). We won't be +// actually decoding with this, we'll just use it as the 'denominator graph' in +// acoustic model estimation. The reason for avoiding smoothing is to reduce +// the number of transitions in the language model, which will improve +// efficiency of training. + +struct LanguageModelOptions { + int32 ngram_order; // you might want to tune this + int32 num_extra_lm_states; // you also might want to tune this + int32 no_prune_ngram_order; // e.g. set this to 3 and it won't prune the + // trigram contexts (note: a trigram + // history-state has 2 known left phones)... this + // tends to make for a more compact graph (since + // the context FST anyway expands to trigram). 
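+  // For example (illustrative): with no_prune_ngram_order = 3 and
+  // ngram_order = 4, all trigram history-states (2 known left phones) are
+  // kept, while 4-gram history-states are candidates for hard backoff, of
+  // which roughly num_extra_lm_states will survive pruning.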
+ + LanguageModelOptions(): + ngram_order(4), + num_extra_lm_states(1000), + no_prune_ngram_order(3) { } + + void Register(OptionsItf *opts) { + opts->Register("ngram-order", &ngram_order, "n-gram order for the phone " + "language model used for the 'denominator model'"); + opts->Register("num-extra-lm-states", &num_extra_lm_states, "Number of LM " + "states desired on top of the nubmer determined by the " + "--no-prune-ngram-order option."); + opts->Register("no-prune-ngram-order", &no_prune_ngram_order, "n-gram order " + "below which the language model is not pruned (should " + "probably be set the same as your --context-width for phone " + "context in tree building, to make the graph as compact as " + "possible)"); + } +}; + +/** + This LanguageModelEstimator class estimates an n-gram language model + with a kind of 'hard' backoff that is intended to reduce the number of + arcs in the final compiled FST. Basically, we never back off to the lower-order + n-gram state, but we sometimes do just say, "this state's count is too small + so we won't have this state at all", and this LM state disappears and + transitions to it go to the lower-order n-gram's state. + + This language model is implemented as a set of states, and transitions + between these states; there is no concept of a backoff transition here. + Because this maps very naturally to an FST, we output it as an FST. + */ +class LanguageModelEstimator { + public: + LanguageModelEstimator(LanguageModelOptions &opts): opts_(opts), + num_active_lm_states_(0) { + KALDI_ASSERT(opts.ngram_order >= 1 && opts.no_prune_ngram_order >= 1); + } + + // Adds counts for this sentence. Basically does: for each n-gram in the + // sentence, count[n-gram] += 1. The only constraint on 'sentence' is that it + // should contain no zeros. + void AddCounts(const std::vector &sentence); + + // Estimates the LM and outputs it as an FST. Note: there is + // no concept here of backoff arcs. + void Estimate(fst::StdVectorFst *fst); + + protected: + struct LmState { + // the phone history associated with this state (length can vary). + std::vector history; + // maps from + std::map phone_to_count; + // total count of this state. As we back off states to lower-order states + // (and note that this is a hard backoff where we completely remove un-needed + // states) this tot_count may become zero. + int32 tot_count; + + // total count of this state plus all states that back off to this state. + // only valid after SetParentCounts() is called. + int32 tot_count_with_parents; + + // LM-state index of the backoff LM state (if it exists, else -1)... + // provided for convenience. The backoff state exist if and only + // if history.size() >= no_prune_ngram_order + int32 backoff_lmstate_index; + + // keeps track of the number of other LmStates 'other' for whom + // (other.tot_count > 0 or other.num_parents > 0) and + // other.backoff_lmstate_index is the index of this LM state. + // This lets us know whether this state has a chance, in the future, + // of getting a nonzero count, which in turn is used in the + // BackoffAllowed() function. + int32 num_parents; + + // this is only set after we decide on the FST state numbering (at the end). + // If not set, it's -1. + int32 fst_state; + + // True if backoff of this state is allowed (which implies it's in the queue). + // Backoff of this state is allowed (i.e. 
we will consider removing this state) + // if its history length is >= opts.no_prune_ngram_order, and it has nonzero + // count, and + bool backoff_allowed; + + void AddCount(int32 phone, int32 count); + + // Log-likelihood of data in this case, summed, not averaged: + // i.e. sum(phone in phones) count(phone) * log-prob(phone | this state). + BaseFloat LogLike() const; + // Add the contents of another LmState. + void Add(const LmState &other); + // Clear all counts from this state. + void Clear(); + LmState(): tot_count(0), tot_count_with_parents(0), backoff_lmstate_index(-1), + fst_state(-1), backoff_allowed(false) { } + LmState(const LmState &other): + history(other.history), phone_to_count(other.phone_to_count), + tot_count(other.tot_count), tot_count_with_parents(other.tot_count_with_parents), + backoff_lmstate_index(other.backoff_lmstate_index), + fst_state(other.fst_state), backoff_allowed(other.backoff_allowed) { } + }; + + // maps from history to int32 + typedef unordered_map, int32, VectorHasher > MapType; + + LanguageModelOptions opts_; + + MapType hist_to_lmstate_index_; + std::vector lm_states_; // indexed by lmstate_index, the LmStates. + + // Keeps track of the number of lm states that have nonzero counts. + int32 num_active_lm_states_; + + // The number of LM states that we would have due to the + // no_prune_ngram_order_. Equals the number of history-states of length + // no_prune_ngram_order_ - 1. Used to compute the total number of desired + // state (by adding opts_.num_extra_lm_states). + int32 num_basic_lm_states_; + + // Queue of pairs: (likelihood change [which is negative], lm_state_index). + // We always pick the one with the highest (least negative) likelihood change + // to merge. Note: elements in the queue can get out of date, so it's + // necessary to check that something is up-to-date (i.e. the likelihood change + // is accurate) before backing off a state. + // Note: after InitializeQueue() is called, any state that has nonzero count + // and history-length >= no_prune_ngram_order, will be in the queue. + // + // This whole algorithm is slightly approximate (i.e. it may not always back + // off the absolutely lowest-cost states), because we don't force + // recomputation of all the costs each time we back something off. Generally + // speaking, these costs will only increase as we back off more states, so the + // approximation is not such a big deal. + std::priority_queue > queue_; + + + // adds the counts for this ngram (called from AddCounts()). + inline void IncrementCount(const std::vector &history, + int32 next_phone); + + + // Computes whether backoff should be allowed for this lm_state. (the caller + // can set the backoff_allowed variable to match). Backoff is allowed if the + // history length is >= opts_.no_prune_ngram_order, and tot_count == + // tot_count_with_parents (i.e. there are no parents that are not yet backed + // off), and the total count is nonzero, and all transitions from this state + // involve backoff. (i.e. backoff is disallowed if the the history-state + // (this history-state + next-phone) exists. + bool BackoffAllowed(int32 lm_state) const; + + // sets up tot_count_with_parents in all the lm-states + void SetParentCounts(); + + // Computes the change, in log-likelihood caused by backing off this lm state + // to its backoff state, i.e. combining its counts with those of its backoff + // state. This lm state must have backoff_allowed set to true. This function + // returns what can be interpreted as a negated cost. 
As a special case, if + // the backoff state has a zero count but this state has a nonzero count, we + // set the like-change to -1e-15 * (count of this state). Before the backoff + // states have any counts, this encourages the lowest-count states to get + // backed-off first. + BaseFloat BackoffLogLikelihoodChange(int32 lmstate_index) const; + + // Adds to the queue, all LmStates that have nonzero count and history-length is + // >= no_prune_ngram_order. + void InitializeQueue(); + + // does the logic of pruning/backing-off states. + void DoBackoff(); + + // This function, will back off the counts of this lm_state to its + // backoff state, and update num_active_lm_states_ as appropriate. + // If the count of the backoff state was previously zero, and the backoff + // state's history-length is >= no_prune_ngram_order, the backoff + // state will get added to the queue. + void BackOffState(int32 lm_state); + + // Check, that num_active_lm_states_ is accurate, and returns + // the number of 'basic' LM-states (i.e. the number of lm-states whose history + // is of length no_prune_ngram_order - 1). + int32 CheckActiveStates() const; + + // Finds and returns an LM-state index for a history -- or -1 if it doesn't + // exist. No backoff is done. + int32 FindLmStateIndexForHistory(const std::vector &hist) const; + + // Finds and returns an LM-state index for a history -- and creates one if + // it doesn't exist -- and also creates any backoff states needed, down + // to history-length no_prune_ngram_order - 1. + int32 FindOrCreateLmStateIndexForHistory(const std::vector &hist); + + // Finds and returns the most specific LM-state index for a history or + // backed-off versions of it, that exists and has nonzero count. Will die if + // there is no such history. [e.g. if there is no unigram backoff state, + // which generally speaking there won't be.] + int32 FindNonzeroLmStateIndexForHistory(std::vector hist) const; + + // after all backoff has been done, assigns FST state indexes to all states + // that exist and have nonzero count. Returns the number of states. + int32 AssignFstStates(); + + // find the FST index of the initial-state, and returns it. + int32 FindInitialFstState() const; + + void OutputToFst( + int32 num_fst_states, + fst::StdVectorFst *fst) const; + +}; + + + +} // namespace chain +} // namespace kaldi + +#endif + diff --git a/src/chain/phone-context.h b/src/chain/phone-context.h new file mode 100644 index 00000000000..bfcb56e64d1 --- /dev/null +++ b/src/chain/phone-context.h @@ -0,0 +1,188 @@ +// chain/phone-context.h + +// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ + +#ifndef KALDI_CHAIN_PHONE_CONTEXT_H_ +#define KALDI_CHAIN_PHONE_CONTEXT_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" + +namespace kaldi { +namespace chain { + + +/** + The 'PhoneContext' object is responsible for mapping phones in left-context to + cd-phones (context-dependent phones). In the 'chain' models, we only support + left-context, in order to make phone-level discriminative training + sufficiently efficient. The 'PhoneContext' model represents all the + information we need to know about the phonetic-context decision tree (so after + building the decision tree, we can build the PhoneContext object and then + discard the tree). + + There two types of cd-phones: cd-phones, and physical cd-phones. The logical + ones can be mapped to physical. The logical cd-phones are the ones that we + actually put in the graph, which will enable us to work out the phone sequence + (assuming the topology is 'alignable', which it normally will be). Logical + cd-phones are mappable to the (mono) phone; the physical ones are less + detailed, and can't necessarily be mapped to the monophones. + + Note that the PhoneTopology and PhoneContext will be incorporated as data + members in the ContextDependentTopology model, which contains information + about topology and context, and also controls the allocation of output-ids + (which are indexes into the neural net output, and roughly correspond to + context-dependent states in a conventional HMM-based system). +*/ + +class PhoneContext: public fst::DeterministicOnDemandFst { + public: + /* First, members that relate to the base class. */ + + // repeat the typedefs (they're not inherited automatically; we could inherit + // but they are boilerplate so we just repeat them). + typedef typename fst::StdArc Arc; + typedef typename Arc::StateId StateId; // should be int32. + typedef typename Arc::Weight Weight; + typedef typename Arc::Label Label; // should be int32. + + // The following are part of the interface from DeterministicOnDemandFst. + virtual StateId Start() { return 0; } + + // all states are final. + virtual Weight Final(StateId s) { return Weight::One(); } + + // Assuming 0 <= s < NumStates() and 1 <= phone <= NumPhones(), + // this function will return true and output to Arc as follows: + // ilabel = phone, olabel = logical-cd-phone, weight = One(), + // nextstate = [the next state after seeing this phone.] + virtual bool GetArc(StateId s, Label phone, Arc *oarc) = 0; + + // There is a concept of states in this model, whereby when it outputs a phone + // it advances the state. So it's an FST-like representation of the decision + // tree. States are numbered from 0 to NumStates() - 1. This function is + // actually not in the interface, but it is the same as in ExpandedFst. + int32 NumStates() const { return transitions_.size(); } + + virtual ~PhoneContext(); + + /* Next members not relating to the base class. */ + + PhoneContext(); + + // Initialization from a tree (which must be left-context only, i.e. + // CentralPosition() == ContextWidth() - 1). The initialization method relies + // on enumerating all possible contexts, so it will be slow if you have a + // ridiculously large context. + + // Note: we hope not to use this, we will use a separate version of the + // tree-building code that tries to reduce the number of 'context states'. + PhoneContext(int32 num_phones, const ContextDependencyInterface &ctx_dep); + + // Phones are numbered from 1 to NumPhones(). 
+ int32 NumPhones() const { return num_phones_; } + + + // Return the number of distinct labels on the topology FST for this phone: + // the labels must be contiguously numbered from 1, so this is the same as + // the largest topology label. + bool GetNumLabels(int32 phone) const; + + // Logical context-dependent phones are numbered from 1 to + // NumLogicalCdPhones(). + int32 NumLogicalCdPhones() const { return logical_to_phone_.size() - 1; } + + // Physical context-dependent phones are numbered from 1 to + // NumPhysicalCdPhones(). + int32 NumPhysicalCdPhones() const { return num_physical_cd_phones_; } + + // This function tells you how many phones of left-context the underlying + // decision tree was built with: 0 for monophone, 1 for left-biphone, etc. It + // amounts to an assertion that if you take a given phone sequence of length + // LeftContext(), and starting from any FST state, use that phone-sequence as + // ilabels, you'll always end up in the same state. + int32 LeftContext() const { return left_context_; } + + // Maps a logical CD-phone to the phone index (i.e. of the monophone with + // no context)-- you cannot map to a full context, that is not what + // logical CD-phones mean in this code. + int32 LogicalToPhone(int32 logical_cd_phone) const; + + // Maps a logical CD-phone to a physical CD-phone. + int32 LogicalToPhysical(int32 logical_cd_phone) const; + + // Given a context-dependent phone index, return the set of phones it may + // correspond to (in most cases this would be a set of just one element). + // We'll implement this when we need it- it will require storing derived + // variables, to make it efficient. + // + // void CdPhoneToPhones(int32 cd_phone, std::vector *phones); + + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is); + + // Outputs to 'output' an FST that's a copy of this object in the normal FST + // format (as opposed to DeterministicOnDemandFst). This is the 'C' FST + // (the context-dependency FST) in the HCLG recipe. + // ilabels are phones, olabels are cd-phones. Note: can be implemented by + // taking an FST 'f' with one state that's initial and final, with self-loops + // for each phone, and then calling ComposeDeterministicOnDemand(f, *this, + // output). + void GetAsFst(fst::VectorFst* output) const; + private: + void Check(); + // Sets up the cd_phone_to_phone_ array. + void ComputeCdPhoneToPhone(); + + int32 num_phones_; + int32 num_physical_cd_phones_; + int32 left_context_; + + // 'transitions_' is indexed by state, then by phone - 1 (each vector of pairs + // is of length num_phones), and each pair is (cd-phone-index, next-state). + // For instance (bear in mind that 0 is the initial-state that you get at the + // begining of a phone_sequence), transitions_[0][p].first is the + // logical-cd-phone you get from seeing phone p with the left-context being the + // beginning of a sequence (i.e. a left-context of all zeros, as far as the + // tree is concerned); and transitions_[0][p].second is the context state you + // go to after seeing that phone. + std::vector > > transitions_; + + // map logical CD-phones to phones. Indexed by logical CD-phone (zeroth + // element not used). + std::vector logical_to_phone_; + + // map logical CD-phones to physical CD-phones. Indexed by logical CD-phone (zeroth + // element not used). 
+ std::vector logical_to_physical_; + +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_PHONE_CONTEXT_H_ diff --git a/src/chain/phone-topology.cc b/src/chain/phone-topology.cc new file mode 100644 index 00000000000..e0a3fb639b7 --- /dev/null +++ b/src/chain/phone-topology.cc @@ -0,0 +1,98 @@ +// chain/phone-topology.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xingyu Na + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "chain/phone-topology.h" + +namespace kaldi { +namespace chain { + + +const fst::VectorFst& PhoneTopolgy::TopologyForPhone (int32 phone) { + return fsts_[phone]; +} + +PhoneTopology::PhoneTopology (int32 num_phones) { + fsts_.clear(); + fsts_.resize(num_phones + 1); + for (int32 i = 1; i <= num_phones; i++) { + fst::VectorFst fst; + fst.AddState(); // state 0 + fst.SetStart(0); // set start state + fst.AddState(); // state 1 + fst.AddArc(0, StdArc(1, 1, 0.5, 1)); + fst.AddArc(1, StdArc(2, 2, 0.5, 1)); + fst.SetFinal(1); // set final state + fsts_[i] = fst; + } +} + +void PhoneTopology::Write(std::ostream &os, bool binary) const{ + WriteToken(os, binary, ""); + if (!binary) os << "\n"; + int num_phones = fsts_.size() - 1; + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_phones); + if (!binary) os << "\n"; + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) + WriteFstKaldi(os, binary, *fiter); + WriteToken(os, binary, ""); +} + +void PhoneTopology::Read(std::istream &is, bool binary) const{ + ExpectToken(is, binary, ""); + int num_phones; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_phones); + fsts_.resize(num_phones + 1); + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) + ReadFstKaldi(os, binary, fiter); + ExpectToken(is, binary, ""); +} + +bool PhonoTopology::IsAlignable() { + std::vector >::iterator fiter = fsts_.begin(), + fend = fsts_.end(); + for (++fiter; fiter != fend; ++fiter) { + // Get start state symbles + unordered_set syms; + for (ArcIterator >aiter(*fiter, fiter->Start()); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + syms.insert(arc.ilabel); + } + for (StateIterator siter(*fiter); !siter.Done(); siter.Next()) { + typename Arc::StateId s = siter.Value(); + for (ArcIterator >aiter(*fiter, s); !aiter.Done(); aiter.Next()) { + const Arc &arc = aiter.Value(); + if (arc.nextstate == fiter->Start()) + return false; + if (s != fiter->Start() && syms.find(arc.ilabel) != syms.end()) + return false; + } + } + } + return true; +} + +} // namespace chain +} // namespace kaldi diff --git a/src/chain/phone-topology.h b/src/chain/phone-topology.h new file mode 100644 index 00000000000..cec7e28686d --- /dev/null +++ b/src/chain/phone-topology.h @@ -0,0 +1,99 @@ +// chain/phone-topology.h + +// 
Copyright 2015 Johns Hopkins University (Author: Daniel Povey) + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_CHAIN_PHONE_TOPOLOGY_H_ +#define KALDI_CHAIN_PHONE_TOPOLOGY_H_ + +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" + +namespace kaldi { +namespace chain { + + +/** + The 'PhoneTopology' object stores the topology for each of the phones that the + system handles. This is the equivalent of a HMM topology, except that the + emission probabilities are on the arcs not the states (so it's much more + FST-like), and there are no transition probabilities (these are just folded + into the emission probabilities). Note that it's the fact that the 'chain' + system is trained discriminatively from the start is what enables us to treat + the transition probabilities this way. + + A topology is an epsilon-free finite state acceptor. The + 'normal' topology that you get if you don't do anything special, is as + follows: + +0 1 1 # transition from state 0 to state 1 with label 1. +1 1 2 # transition from state 1 to state 1 (self-loop) with label 2. +1 0 # this says that state 1 is final. + + The FSTs have the following properties: + - they are epsilon free + - the start state is numbered zero. + - the start state is not final. + - all states are used. + - the symbols on the labels of the FST start from 1 and are contiguous (no + unused symbols between the smallest and largest symbol). + + + Phones are given indexes from 1 to NumPhones() (no gaps are allowed here). + + A topology for a phone is an FST + */ + +class PhoneTopology { + public: + int32 NumPhones() { returns static_cast(fsts_.size()) - 1; } + + // Returns the topology for a given phone. + const fst::VectorFst &TopologyForPhone(int32 phone); + + // This constructor gives the phones the default topology. If you want to + // give it a different topology, then you can create the text-form of this + // object using a script. + PhoneTopology(int32 num_phones); + + void Write(std::ostream &os, bool binary) const; + + void Read(std::istream &is, bool binary) const; + + // returns true if all the phones' FSTs have the following properties: + // - the symbols on arcs out of the start-state are disjoint from the + // symbols on arcs out of other states. + // - there are no arcs ending in the start state. + bool IsAlignable(); + private: + void Check(); + + // index zero is not used. 
+ std::vector > fsts_; +}; + + +} // namespace chain +} // namespace kaldi + +#endif // KALDI_CHAIN_PHONE_TOPOLOGY_H_ diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile new file mode 100644 index 00000000000..3fdd7fdb4d0 --- /dev/null +++ b/src/chainbin/Makefile @@ -0,0 +1,30 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ + nnet3-chain-get-egs nnet3-chain-copy-egs nnet3-chain-merge-egs \ + nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ + nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ + nnet3-chain-combine nnet3-chain-normalize-egs + + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a ../gmm/kaldi-gmm.a \ + ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../cudamatrix/kaldi-cudamatrix.a \ + ../matrix/kaldi-matrix.a ../fstext/kaldi-fstext.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/chainbin/chain-est-phone-lm.cc b/src/chainbin/chain-est-phone-lm.cc new file mode 100644 index 00000000000..f16b3f4f14b --- /dev/null +++ b/src/chainbin/chain-est-phone-lm.cc @@ -0,0 +1,81 @@ +// chainbin/chain-est-phone-lm.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "chain/language-model.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + + const char *usage = + "Initialize un-smoothed phone language model for 'chain' training\n" + "Output in FST format (epsilon-free deterministic acceptor)\n" + "\n" + "Usage: chain-est-phone-lm [options] \n" + "The phone-sequences are used to train a language model.\n" + "e.g.:\n" + "gunzip -c input_dir/ali.*.gz | ali-to-phones input_dir/final.mdl ark:- ark:- | \\\n" + " chain-est-phone-lm --leftmost-context-questions=dir/leftmost_questions.txt ark:- dir/phone_G.fst\n"; + + bool binary_write = true; + LanguageModelOptions lm_opts; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + lm_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string phone_seqs_rspecifier = po.GetArg(1), + lm_fst_wxfilename = po.GetArg(2); + + + LanguageModelEstimator lm_estimator(lm_opts); + + SequentialInt32VectorReader phones_reader(phone_seqs_rspecifier); + KALDI_LOG << "Reading phone sequences"; + for (; !phones_reader.Done(); phones_reader.Next()) { + const std::vector &phone_seq = phones_reader.Value(); + lm_estimator.AddCounts(phone_seq); + } + KALDI_LOG << "Estimating phone LM"; + fst::StdVectorFst fst; + lm_estimator.Estimate(&fst); + + WriteFstKaldi(fst, lm_fst_wxfilename); + + KALDI_LOG << "Estimated phone language model and wrote it to " + << lm_fst_wxfilename; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc new file mode 100644 index 00000000000..b05f1166da4 --- /dev/null +++ b/src/chainbin/chain-get-supervision.cc @@ -0,0 +1,151 @@ +// chainbin/chain-get-supervision.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "chain/chain-supervision.h" + +namespace kaldi { +namespace chain { + + +// This wrapper function does all the job of processing the features and +// lattice into ChainSupervision objects, and writing them out. 
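+// Concretely, it compiles the ProtoSupervision (phone-level supervision
+// derived from the alignment or the phone lattice) into a Supervision FST
+// using the tree and transition model, occasionally runs a consistency check,
+// and writes the result keyed by utterance id; it returns false if the
+// compilation fails.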
+static bool ProcessSupervision(const TransitionModel &trans_model, + const ContextDependencyInterface &ctx_dep, + const ProtoSupervision &proto_sup, + const std::string &key, + SupervisionWriter *supervision_writer) { + Supervision supervision; + if (!ProtoSupervisionToSupervision(ctx_dep, trans_model, + proto_sup, &supervision)) { + KALDI_WARN << "Failed creating supervision for utterance " + << key; + return false; + } + if (RandInt(0, 10) == 0) + supervision.Check(trans_model); + + supervision_writer->Write(key, supervision); + return true; +} + + +} // namespace chain +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get a 'chain' supervision object for each file of training data.\n" + "This will normally be piped into nnet3-chain-get-egs, where it\n" + "will be split up into pieces and combined with the features.\n" + "Input can come in two formats: from alignments\n" + "(from ali-to-phones --write-lenghts=true), or from lattices\n" + "(e.g. derived from aligning the data, see steps/align_fmllr_lats.sh)\n" + "that have been converged to phone-level lattices with\n" + "lattice-align-phones --replace-output-symbols=true.\n" + "\n" + "Usage: chain-get-supervision [options] " + "[|] \n" + "See steps/nnet3/chain/get_egs.sh for example\n"; + + + bool lattice_input = false; + SupervisionOptions sup_opts; + + ParseOptions po(usage); + sup_opts.Register(&po); + po.Register("lattice-input", &lattice_input, "If true, expect phone " + "lattices as input"); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + + std::string tree_rxfilename = po.GetArg(1), + trans_model_rxfilename = po.GetArg(2), + phone_durs_or_lat_rspecifier = po.GetArg(3), + supervision_wspecifier = po.GetArg(4); + + TransitionModel trans_model; + ReadKaldiObject(trans_model_rxfilename, &trans_model); + + ContextDependency ctx_dep; + ReadKaldiObject(tree_rxfilename, &ctx_dep); + + SupervisionWriter supervision_writer(supervision_wspecifier); + + int32 num_utts_done = 0, num_utts_error = 0; + + if (lattice_input) { + SequentialCompactLatticeReader clat_reader(phone_durs_or_lat_rspecifier); + for (; !clat_reader.Done(); clat_reader.Next()) { + std::string key = clat_reader.Key(); + const CompactLattice &clat = clat_reader.Value(); + ProtoSupervision proto_supervision; + bool ans = PhoneLatticeToProtoSupervision(sup_opts, clat, + &proto_supervision); + if (!ans) { + KALDI_WARN << "Error creating proto-supervision for utterance " << key; + num_utts_error++; + continue; + } + if (ProcessSupervision(trans_model, ctx_dep, + proto_supervision, key, &supervision_writer)) + num_utts_done++; + else + num_utts_error++; + } + } else { + SequentialInt32PairVectorReader phone_and_dur_reader( + phone_durs_or_lat_rspecifier); + for (; !phone_and_dur_reader.Done(); phone_and_dur_reader.Next()) { + std::string key = phone_and_dur_reader.Key(); + const std::vector > &ali = + phone_and_dur_reader.Value(); + ProtoSupervision proto_supervision; + AlignmentToProtoSupervision(sup_opts, ali, + &proto_supervision); + if (ProcessSupervision(trans_model, ctx_dep, + proto_supervision, key, &supervision_writer)) + num_utts_done++; + else + num_utts_error++; + } + } + KALDI_LOG << "Generated chain supervision information for " + << num_utts_done << " utterances, errors on " + << num_utts_error; + return (num_utts_done > num_utts_error ? 
0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
diff --git a/src/chainbin/chain-make-den-fst.cc b/src/chainbin/chain-make-den-fst.cc
new file mode 100644
index 00000000000..0d8d249242b
--- /dev/null
+++ b/src/chainbin/chain-make-den-fst.cc
@@ -0,0 +1,86 @@
+// chainbin/chain-make-den-fst.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "chain/chain-den-graph.h"
+
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::chain;
+    typedef kaldi::int32 int32;
+
+    const char *usage =
+        "Create 'denominator' FST for 'chain' training\n"
+        "Outputs in FST format. <denominator-fst> is an epsilon-free acceptor;\n"
+        "<normalization-fst> is a modified version of <denominator-fst> (w.r.t.\n"
+        "initial and final probs) that is used in example generation.\n"
+        "\n"
+        "Usage: chain-make-den-fst [options] <tree> <transition-model> "
+        "<phone-lm-fst> <denominator-fst> <normalization-fst>\n"
+        "e.g.:\n"
+        "chain-make-den-fst dir/tree dir/0.trans_mdl dir/phone_lm.fst dir/den.fst dir/normalization.fst\n";
+
+    ParseOptions po(usage);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 5) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string tree_rxfilename = po.GetArg(1),
+        transition_model_rxfilename = po.GetArg(2),
+        phone_lm_rxfilename = po.GetArg(3),
+        den_fst_wxfilename = po.GetArg(4),
+        normalization_fst_wxfilename = po.GetArg(5);
+
+
+    ContextDependency ctx_dep;
+    TransitionModel trans_model;
+    fst::StdVectorFst phone_lm;
+
+    ReadKaldiObject(tree_rxfilename, &ctx_dep);
+    ReadKaldiObject(transition_model_rxfilename, &trans_model);
+    ReadFstKaldi(phone_lm_rxfilename, &phone_lm);
+
+    fst::StdVectorFst den_fst;
+    chain::CreateDenominatorFst(ctx_dep, trans_model, phone_lm,
+                                &den_fst);
+
+    fst::StdVectorFst normalization_fst;
+    chain::DenominatorGraph den_graph(den_fst, trans_model.NumPdfs());
+    den_graph.GetNormalizationFst(den_fst, &normalization_fst);
+
+
+    WriteFstKaldi(den_fst, den_fst_wxfilename);
+    WriteFstKaldi(normalization_fst, normalization_fst_wxfilename);
+
+    KALDI_LOG << "Wrote denominator FST to " << den_fst_wxfilename
+              << " and normalization FST to " << normalization_fst_wxfilename;
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc
new file mode 100644
index 00000000000..3f092879b6e
--- /dev/null
+++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc
@@ -0,0 +1,206 @@
+// chainbin/nnet3-chain-acc-lda-stats.cc
+
+// Copyright 2015  Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "lat/lattice-functions.h" +#include "nnet3/nnet-nnet.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "transform/lda-estimate.h" + + +namespace kaldi { +namespace nnet3 { + +class NnetChainLdaStatsAccumulator { + public: + NnetChainLdaStatsAccumulator(BaseFloat rand_prune, + const Nnet &nnet): + rand_prune_(rand_prune), nnet_(nnet), compiler_(nnet) { } + + + void AccStats(const NnetChainExample &eg) { + ComputationRequest request; + bool need_backprop = false, store_stats = false, + need_xent = false, need_xent_deriv = false; + + GetChainComputationRequest(nnet_, eg, need_backprop, store_stats, + need_xent, need_xent_deriv, &request); + + const NnetComputation &computation = *(compiler_.Compile(request)); + + NnetComputeOptions options; + if (GetVerboseLevel() >= 3) + options.debug = true; + NnetComputer computer(options, computation, nnet_, NULL); + + computer.AcceptInputs(nnet_, eg.inputs); + computer.Forward(); + const CuMatrixBase &nnet_output = computer.GetOutput("output"); + AccStatsFromOutput(eg, nnet_output); + } + + void WriteStats(const std::string &stats_wxfilename, bool binary) { + if (lda_stats_.TotCount() == 0) { + KALDI_ERR << "Accumulated no stats."; + } else { + WriteKaldiObject(lda_stats_, stats_wxfilename, binary); + KALDI_LOG << "Accumulated stats, soft frame count = " + << lda_stats_.TotCount() << ". Wrote to " + << stats_wxfilename; + } + } + private: + void AccStatsFromOutput(const NnetChainExample &eg, + const CuMatrixBase &nnet_output) { + BaseFloat rand_prune = rand_prune_; + + if (eg.outputs.size() != 1 || eg.outputs[0].name != "output") + KALDI_ERR << "Expecting the example to have one output named 'output'."; + + + const chain::Supervision &supervision = eg.outputs[0].supervision; + // handling the one-sequence-per-eg case is easier so we just do that. + KALDI_ASSERT(supervision.num_sequences == 1 && + "This program expects one sequence per eg."); + int32 num_frames = supervision.frames_per_sequence, + num_pdfs = supervision.label_dim; + KALDI_ASSERT(num_frames == nnet_output.NumRows()); + const fst::StdVectorFst &fst = supervision.fst; + + Lattice lat; + // convert the FST to a lattice, putting all the weight on + // the graph weight. This is to save us having to implement the + // forward-backward on FSTs. + ConvertFstToLattice(fst, &lat); + Posterior post; + LatticeForwardBackward(lat, &post); + KALDI_ASSERT(post.size() == static_cast(num_frames)); + + // Subtract one, to convert the (pdf-id + 1) which appears in the + // supervision FST, to a pdf-id. 
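+    // (The +1 offset is presumably there because label 0 is reserved for
+    // epsilon in OpenFst, so pdf-ids cannot appear directly as FST labels.)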
+ for (size_t i = 0; i < post.size(); i++) + for (size_t j = 0; j < post[i].size(); j++) + post[i][j].first--; + + if (lda_stats_.Dim() == 0) + lda_stats_.Init(num_pdfs, + nnet_output.NumCols()); + + for (int32 t = 0; t < num_frames; t++) { + // the following, transferring row by row to CPU, would be wasteful if we + // actually were using a GPU, but we don't anticipate using a GPU in this + // program. + CuSubVector cu_row(nnet_output, t); + // "row" is actually just a redudant copy, since we're likely on CPU, + // but we're about to do an outer product, so this doesn't dominate. + Vector row(cu_row); + + std::vector >::const_iterator + iter = post[t].begin(), end = post[t].end(); + + for (; iter != end; ++iter) { + int32 pdf = iter->first; + BaseFloat weight = iter->second; + BaseFloat pruned_weight = RandPrune(weight, rand_prune); + if (pruned_weight != 0.0) + lda_stats_.Accumulate(row, pdf, pruned_weight); + } + } + } + + BaseFloat rand_prune_; + const Nnet &nnet_; + CachingOptimizingCompiler compiler_; + LdaEstimate lda_stats_; +}; + +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Accumulate statistics in the same format as acc-lda (i.e. stats for\n" + "estimation of LDA and similar types of transform), starting from nnet+chain\n" + "training examples. This program puts the features through the network,\n" + "and the network output will be the features; the supervision in the\n" + "training examples is used for the class labels. Used in obtaining\n" + "feature transforms that help nnet training work better.\n" + "Note: the time boundaries it gets from the chain supervision will be\n" + "a little fuzzy (which is not ideal), but it should not matter much in\n" + "this situation\n" + "\n" + "Usage: nnet3-chain-acc-lda-stats [options] \n" + "e.g.:\n" + "nnet3-chain-acc-lda-stats 0.raw ark:1.cegs 1.acc\n" + "See also: nnet-get-feature-transform\n"; + + bool binary_write = true; + BaseFloat rand_prune = 0.0; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("rand-prune", &rand_prune, + "Randomized pruning threshold for posteriors"); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string nnet_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + lda_accs_wxfilename = po.GetArg(3); + + // Note: this neural net is probably just splicing the features at this + // point. + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + NnetChainLdaStatsAccumulator accumulator(rand_prune, nnet); + + int64 num_egs = 0; + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + for (; !example_reader.Done(); example_reader.Next(), num_egs++) + accumulator.AccStats(example_reader.Value()); + + KALDI_LOG << "Processed " << num_egs << " examples."; + // the next command will die if we accumulated no stats. 
+ accumulator.WriteStats(lda_accs_wxfilename, binary_write); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc new file mode 100644 index 00000000000..a7083c8332e --- /dev/null +++ b/src/chainbin/nnet3-chain-combine.cc @@ -0,0 +1,121 @@ +// chainbin/nnet3-chain-combine.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-combine.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Using a subset of training or held-out nnet3+chain examples, compute an\n" + "optimal combination of anumber of nnet3 neural nets by maximizing the\n" + "'chain' objective function. See documentation of options for more details.\n" + "Inputs and outputs are nnet3 raw nnets.\n" + "\n" + "Usage: nnet3-chain-combine [options] ... \n" + "\n" + "e.g.:\n" + " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; + + bool binary_write = true; + std::string use_gpu = "yes"; + NnetCombineConfig combine_config; + chain::ChainTrainingOptions chain_config; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + + combine_config.Register(&po); + chain_config.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() < 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string + den_fst_rxfilename = po.GetArg(1), + raw_nnet_rxfilename = po.GetArg(2), + valid_examples_rspecifier = po.GetArg(po.NumArgs() - 1), + nnet_wxfilename = po.GetArg(po.NumArgs()); + + + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + Nnet nnet; + ReadKaldiObject(raw_nnet_rxfilename, &nnet); + + + std::vector egs; + egs.reserve(10000); // reserve a lot of space to minimize the chance of + // reallocation. + + { // This block adds training examples to "egs". 
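+      // Note: despite the variable name, these may be held-out or training
+      // examples; they are only used to choose the combination weights.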
+ SequentialNnetChainExampleReader example_reader( + valid_examples_rspecifier); + for (; !example_reader.Done(); example_reader.Next()) + egs.push_back(example_reader.Value()); + KALDI_LOG << "Read " << egs.size() << " examples."; + KALDI_ASSERT(!egs.empty()); + } + + + int32 num_nnets = po.NumArgs() - 3; + NnetChainCombiner combiner(combine_config, chain_config, + num_nnets, egs, den_fst, nnet); + + for (int32 n = 1; n < num_nnets; n++) { + std::string this_nnet_rxfilename = po.GetArg(n + 2); + ReadKaldiObject(this_nnet_rxfilename, &nnet); + combiner.AcceptNnet(nnet); + } + + combiner.Combine(); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + + WriteKaldiObject(combiner.GetNnet(), nnet_wxfilename, binary_write); + + KALDI_LOG << "Finished combining neural nets, wrote model to " + << nnet_wxfilename; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-compute-prob.cc b/src/chainbin/nnet3-chain-compute-prob.cc new file mode 100644 index 00000000000..7f9d688777a --- /dev/null +++ b/src/chainbin/nnet3-chain-compute-prob.cc @@ -0,0 +1,88 @@ +// nnet3bin/nnet3-chain-compute-prob.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-diagnostics.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Computes and prints to in logging messages the average log-prob per frame of\n" + "the given data with an nnet3+chain neural net. The input of this is the output of\n" + "e.g. nnet3-chain-get-egs | nnet3-chain-merge-egs.\n" + "\n" + "Usage: nnet3-chain-compute-prob [options] \n" + "e.g.: nnet3-chain-compute-prob 0.mdl den.fst ark:valid.egs\n"; + + + // This program doesn't support using a GPU, because these probabilities are + // used for diagnostics, and you can just compute them with a small enough + // amount of data that a CPU can do it within reasonable time. + // It wouldn't be hard to make it support GPU, though. 
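+    // Illustrative invocation in a training script (file names hypothetical):
+    //   nnet3-chain-compute-prob 10.raw den.fst \
+    //     "ark:nnet3-chain-merge-egs ark:valid_diagnostic.cegs ark:- |"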
+ + NnetComputeProbOptions nnet_opts; + chain::ChainTrainingOptions chain_opts; + + ParseOptions po(usage); + + nnet_opts.Register(&po); + chain_opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string nnet_rxfilename = po.GetArg(1), + den_fst_rxfilename = po.GetArg(2), + examples_rspecifier = po.GetArg(3); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + NnetChainComputeProb chain_prob_computer(nnet_opts, chain_opts, den_fst, + nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + chain_prob_computer.Compute(example_reader.Value()); + + bool ok = chain_prob_computer.PrintTotalStats(); + + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc new file mode 100644 index 00000000000..5404cdc438e --- /dev/null +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -0,0 +1,140 @@ +// chainbin/nnet3-chain-copy-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" + +namespace kaldi { +// returns an integer randomly drawn with expected value "expected_count" +// (will be either floor(expected_count) or ceil(expected_count)). +int32 GetCount(double expected_count) { + KALDI_ASSERT(expected_count >= 0.0); + int32 ans = floor(expected_count); + expected_count -= ans; + if (WithProb(expected_count)) + ans++; + return ans; +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Copy examples for nnet3+chain network training, possibly changing the binary mode.\n" + "Supports multiple wspecifiers, in which case it will write the examples\n" + "round-robin to the outputs.\n" + "\n" + "Usage: nnet3-chain-copy-egs [options] [ ...]\n" + "\n" + "e.g.\n" + "nnet3-chain-copy-egs ark:train.cegs ark,t:text.cegs\n" + "or:\n" + "nnet3-chain-copy-egs ark:train.cegs ark:1.cegs ark:2.cegs\n"; + + bool random = false; + int32 srand_seed = 0; + int32 frame_shift = 0; + int32 truncate_deriv_weights = 0; + BaseFloat keep_proportion = 1.0; + + ParseOptions po(usage); + po.Register("random", &random, "If true, will write frames to output " + "archives randomly, not round-robin."); + po.Register("keep-proportion", &keep_proportion, "If <1.0, this program will " + "randomly keep this proportion of the input samples. 
If >1.0, it will " + "in expectation copy a sample this many times. It will copy it a number " + "of times equal to floor(keep-proportion) or ceil(keep-proportion)."); + po.Register("srand", &srand_seed, "Seed for random number generator " + "(only relevant if --random=true or --keep-proportion != 1.0)"); + po.Register("frame-shift", &frame_shift, "Allows you to shift time values " + "in the supervision data (excluding iVector data) - useful in " + "augmenting data. Note, the outputs will remain at the closest " + "exact multiples of the frame subsampling factor"); + po.Register("truncate-deriv-weights", &truncate_deriv_weights, + "If nonzero, the number of initial/final subsample frames that " + "will have their derivatives' weights set to zero."); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + int32 num_outputs = po.NumArgs() - 1; + std::vector example_writers(num_outputs); + for (int32 i = 0; i < num_outputs; i++) + example_writers[i] = new NnetChainExampleWriter(po.GetArg(i+2)); + + std::vector exclude_names; // names we never shift times of; + // not configurable for now. + exclude_names.push_back(std::string("ivector")); + + + int64 num_read = 0, num_written = 0; + for (; !example_reader.Done(); example_reader.Next(), num_read++) { + // count is normally 1; could be 0, or possibly >1. + int32 count = GetCount(keep_proportion); + std::string key = example_reader.Key(); + if (frame_shift == 0 && truncate_deriv_weights == 0) { + const NnetChainExample &eg = example_reader.Value(); + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; + } + } else if (count > 0) { + NnetChainExample eg = example_reader.Value(); + if (frame_shift != 0) + ShiftChainExampleTimes(frame_shift, exclude_names, &eg); + if (truncate_deriv_weights != 0) + TruncateDerivWeights(truncate_deriv_weights, &eg); + for (int32 c = 0; c < count; c++) { + int32 index = (random ? Rand() : num_written) % num_outputs; + example_writers[index]->Write(key, eg); + num_written++; + } + } + } + for (int32 i = 0; i < num_outputs; i++) + delete example_writers[i]; + KALDI_LOG << "Read " << num_read + << " neural-network training examples, wrote " << num_written; + return (num_written == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc new file mode 100644 index 00000000000..321b18ed122 --- /dev/null +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -0,0 +1,372 @@ +// chainbin/nnet3-chain-get-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "hmm/posterior.h"
+#include "nnet3/nnet-example.h"
+#include "nnet3/nnet-chain-example.h"
+#include "nnet3/nnet-example-utils.h"
+
+namespace kaldi {
+namespace nnet3 {
+
+
+/**
+   This function does all the processing for one utterance, and outputs the
+   supervision objects to 'example_writer'.  Note: if normalization_fst is the
+   empty FST (with no states), it skips the final stage of egs preparation and
+   you should do it later with nnet3-chain-normalize-egs.
+*/
+
+static bool ProcessFile(const fst::StdVectorFst &normalization_fst,
+                        const MatrixBase<BaseFloat> &feats,
+                        const MatrixBase<BaseFloat> *ivector_feats,
+                        const chain::Supervision &supervision,
+                        const std::string &utt_id,
+                        bool compress,
+                        int32 left_context,
+                        int32 right_context,
+                        int32 frames_per_eg,
+                        int32 frames_overlap_per_eg,
+                        int32 frame_subsampling_factor,
+                        int32 cut_zero_frames,
+                        int64 *num_frames_written,
+                        int64 *num_egs_written,
+                        NnetChainExampleWriter *example_writer) {
+  KALDI_ASSERT(supervision.num_sequences == 1);
+  int32 num_feature_frames = feats.NumRows(),
+      num_output_frames = supervision.frames_per_sequence,
+      num_feature_frames_subsampled =
+      (num_feature_frames + frame_subsampling_factor - 1) /
+      frame_subsampling_factor;
+  if (num_output_frames != num_feature_frames_subsampled) {
+    // we tolerate deviations in the num-frames if they are very small (1 output
+    // frame).
+
+    if (abs(num_output_frames - num_feature_frames_subsampled) > 1) {
+      KALDI_ERR << "Mismatch in num-frames: chain supervision has "
+                << num_output_frames
+                << " versus features/frame_subsampling_factor = "
+                << num_feature_frames << " / " << frame_subsampling_factor
+                << " = " << num_feature_frames_subsampled
+                << ": check that --frame-subsampling-factor option is set "
+                << "the same as to chain-get-supervision.";
+    }
+    int32 new_num_feature_frames =
+        num_output_frames * frame_subsampling_factor;
+    // add a few frames at the end to make it match up.
+    Matrix<BaseFloat> feats_new(new_num_feature_frames, feats.NumCols(),
+                                kUndefined);
+    int32 min_feature_frames = std::min(num_feature_frames,
+                                        new_num_feature_frames);
+    feats_new.RowRange(0, min_feature_frames).CopyFromMat(
+        feats.RowRange(0, min_feature_frames));
+    for (int32 i = num_feature_frames; i < new_num_feature_frames; i++)
+      feats_new.Row(i).CopyFromVec(feats.Row(num_feature_frames - 1));
+    return ProcessFile(normalization_fst, feats_new, ivector_feats,
+                       supervision, utt_id, compress, left_context, right_context,
+                       frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor,
+                       cut_zero_frames, num_frames_written, num_egs_written,
+                       example_writer);
+  }
+
+  KALDI_ASSERT(frames_per_eg % frame_subsampling_factor == 0);
+
+  int32 frames_per_eg_subsampled = frames_per_eg / frame_subsampling_factor,
+      frames_overlap_subsampled = frames_overlap_per_eg / frame_subsampling_factor,
+      frames_shift_subsampled = frames_per_eg_subsampled - frames_overlap_subsampled;
+
+  if (num_feature_frames_subsampled < frames_per_eg_subsampled) {
+    KALDI_WARN << "Length of features for utterance " << utt_id
+               << " is less than the frames_per_eg (after sub-sampling).";
+    return false;
+  }
+
+  // we don't do any padding, as it would be a bit tricky to pad the 'chain' supervision.
+  // Instead we select ranges of frames that fully fit within the file; these
+  // might slightly overlap with each other or have gaps.
+  std::vector<int32> range_starts_subsampled;
+  chain::SplitIntoRanges(num_feature_frames_subsampled -
+                         frames_overlap_subsampled,
+                         frames_shift_subsampled,
+                         &range_starts_subsampled);
+  // The 'deriv_weights' make sure we don't count frames twice, and also ensure
+  // that we tend to avoid having nonzero weights on the derivatives that are
+  // too close to the edge of the corresponding 'range' (these derivatives close
+  // to the edge are not as accurate as they could be, because when we split we
+  // don't know the correct alphas and betas).
+  std::vector<Vector<BaseFloat> > deriv_weights;
+  if (cut_zero_frames >= 0)
+    chain::GetWeightsForRangesNew(frames_per_eg_subsampled,
+                                  cut_zero_frames / frame_subsampling_factor,
+                                  range_starts_subsampled,
+                                  &deriv_weights);
+  else
+    chain::GetWeightsForRanges(frames_per_eg_subsampled,
+                               range_starts_subsampled,
+                               &deriv_weights);
+
+  if (range_starts_subsampled.empty()) {
+    KALDI_WARN << "No output for utterance " << utt_id
+               << " (num-frames=" << num_feature_frames
+               << ") because too short for --frames-per-eg="
+               << frames_per_eg;
+    return false;
+  }
+  chain::SupervisionSplitter splitter(supervision);
+
+  for (size_t i = 0; i < range_starts_subsampled.size(); i++) {
+    int32 range_start_subsampled = range_starts_subsampled[i],
+        range_start = range_start_subsampled * frame_subsampling_factor;
+
+    chain::Supervision supervision_part;
+    splitter.GetFrameRange(range_start_subsampled,
+                           frames_per_eg_subsampled,
+                           &supervision_part);
+
+    if (normalization_fst.NumStates() > 0 &&
+        !AddWeightToSupervisionFst(normalization_fst,
+                                   &supervision_part)) {
+      KALDI_WARN << "For utterance " << utt_id << ", frames "
+                 << range_start << " to " << (range_start + frames_per_eg)
+                 << ", FST was empty after composing with normalization FST. "
+                 << "This should be extremely rare (a few per corpus, at most)";
+      return false;
+    }
+
+    int32 first_frame = 0;  // we shift the time-indexes of all these parts so
+                            // that the supervised part starts from frame 0.
+    NnetChainSupervision nnet_supervision("output", supervision_part,
+                                          deriv_weights[i],
+                                          first_frame, frame_subsampling_factor);
+
+    NnetChainExample nnet_chain_eg;
+    nnet_chain_eg.outputs.resize(1);
+    nnet_chain_eg.outputs[0].Swap(&nnet_supervision);
+    nnet_chain_eg.inputs.resize(ivector_feats != NULL ? 2 : 1);
+
+    int32 tot_frames = left_context + frames_per_eg + right_context;
+    Matrix<BaseFloat> input_frames(tot_frames, feats.NumCols(), kUndefined);
+
+    // Set up "input_frames".
+    for (int32 j = -left_context; j < frames_per_eg + right_context; j++) {
+      int32 t = range_start + j;
+      if (t < 0) t = 0;
+      if (t >= feats.NumRows()) t = feats.NumRows() - 1;
+      SubVector<BaseFloat> src(feats, t),
+          dest(input_frames, j + left_context);
+      dest.CopyFromVec(src);
+    }
+    NnetIo input_io("input", -left_context,
+                    input_frames);
+    nnet_chain_eg.inputs[0].Swap(&input_io);
+
+    if (ivector_feats != NULL) {
+      // if applicable, add the iVector feature.
+      // try to get closest frame to middle of window to get
+      // a representative iVector.
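+      // For example (illustrative numbers only): with range_start = 100 and
+      // frames_per_eg = 150, we request frame 100 + 150 / 2 = 175, clamped
+      // below to the last row if the iVector matrix is shorter than that.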
+ int32 closest_frame = range_start + frames_per_eg / 2; + KALDI_ASSERT(ivector_feats->NumRows() > 0); + if (closest_frame >= ivector_feats->NumRows()) + closest_frame = ivector_feats->NumRows() - 1; + Matrix ivector(1, ivector_feats->NumCols()); + ivector.Row(0).CopyFromVec(ivector_feats->Row(closest_frame)); + NnetIo ivector_io("ivector", 0, ivector); + nnet_chain_eg.inputs[1].Swap(&ivector_io); + } + + if (compress) + nnet_chain_eg.Compress(); + + std::ostringstream os; + os << utt_id << "-" << range_start; + + std::string key = os.str(); // key is - + + *num_frames_written += frames_per_eg; + *num_egs_written += 1; + + example_writer->Write(key, nnet_chain_eg); + } + return true; +} + +} // namespace nnet2 +} // namespace kaldi + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Get frame-by-frame examples of data for nnet3+chain neural network\n" + "training. This involves breaking up utterances into pieces of a\n" + "fixed size. Input will come from chain-get-supervision.\n" + "Note: if is not supplied the egs will not be\n" + "ready for training; in that case they should later be processed\n" + "with nnet3-chain-normalize-egs\n" + "\n" + "Usage: nnet3-chain-get-egs [options] [] " + " \n" + "\n" + "An example [where $feats expands to the actual features]:\n" + "chain-get-supervision [args] | \\\n" + " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n" + "Note: the --frame-subsampling-factor option must be the same as given to\n" + "chain-get-supervision.\n"; + + bool compress = true; + int32 left_context = 0, right_context = 0, num_frames = 1, + num_frames_overlap = 0, length_tolerance = 100, + cut_zero_frames = -1, + frame_subsampling_factor = 1; + + std::string ivector_rspecifier; + + ParseOptions po(usage); + po.Register("compress", &compress, "If true, write egs in " + "compressed format (recommended)"); + po.Register("cut-zero-frames", &cut_zero_frames, "Number of frames " + "(measured before subsampling) to zero the derivative on each " + "side of a cut point (if set, activates new-style derivative " + "weights)"); + po.Register("left-context", &left_context, "Number of frames of left " + "context the neural net requires."); + po.Register("right-context", &right_context, "Number of frames of right " + "context the neural net requires."); + po.Register("num-frames", &num_frames, "Number of frames with labels " + "that each example contains. Will be rounded up to a multiple " + "of --frame-subsampling-factor."); + po.Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " + "overlap between each example (could be useful in conjunction " + "--min-deriv-time and --max-deriv-time, to avoid wasting data). 
" + "Each time we shift by --num-frames minus --num-frames-overlap."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier of ivector " + "features, as a matrix."); + po.Register("length-tolerance", &length_tolerance, "Tolerance for " + "difference in num-frames between feat and ivector matrices"); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Used " + "if the frame-rate at the output will be less than the " + "frame-rate of the input"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3 || po.NumArgs() > 4) { + po.PrintUsage(); + exit(1); + } + + if (num_frames <= 0 || left_context < 0 || right_context < 0 || + length_tolerance < 0 || frame_subsampling_factor <= 0) + KALDI_ERR << "One of the integer options is out of the allowed range."; + RoundUpNumFrames(frame_subsampling_factor, + &num_frames, &num_frames_overlap); + + std::string + normalization_fst_rxfilename, + feature_rspecifier, + supervision_rspecifier, + examples_wspecifier; + if (po.NumArgs() == 3) { + feature_rspecifier = po.GetArg(1); + supervision_rspecifier = po.GetArg(2); + examples_wspecifier = po.GetArg(3); + } else { + normalization_fst_rxfilename = po.GetArg(1); + KALDI_ASSERT(!normalization_fst_rxfilename.empty()); + feature_rspecifier = po.GetArg(2); + supervision_rspecifier = po.GetArg(3); + examples_wspecifier = po.GetArg(4); + } + + fst::StdVectorFst normalization_fst; + if (!normalization_fst_rxfilename.empty()) { + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + KALDI_ASSERT(normalization_fst.NumStates() > 0); + } + + SequentialBaseFloatMatrixReader feat_reader(feature_rspecifier); + chain::RandomAccessSupervisionReader supervision_reader( + supervision_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + RandomAccessBaseFloatMatrixReader ivector_reader(ivector_rspecifier); + + int32 num_done = 0, num_err = 0; + int64 num_frames_written = 0, num_egs_written = 0; + + for (; !feat_reader.Done(); feat_reader.Next()) { + std::string key = feat_reader.Key(); + const Matrix &feats = feat_reader.Value(); + if (!supervision_reader.HasKey(key)) { + KALDI_WARN << "No pdf-level posterior for key " << key; + num_err++; + } else { + const chain::Supervision &supervision = supervision_reader.Value(key); + const Matrix *ivector_feats = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(key)) { + KALDI_WARN << "No iVectors for utterance " << key; + num_err++; + continue; + } else { + // this address will be valid until we call HasKey() or Value() + // again. 
+ ivector_feats = &(ivector_reader.Value(key)); + } + } + if (ivector_feats != NULL && + (abs(feats.NumRows() - ivector_feats->NumRows()) > length_tolerance + || ivector_feats->NumRows() == 0)) { + KALDI_WARN << "Length difference between feats " << feats.NumRows() + << " and iVectors " << ivector_feats->NumRows() + << " exceeds tolerance " << length_tolerance; + num_err++; + continue; + } + if (ProcessFile(normalization_fst, feats, ivector_feats, supervision, + key, compress, + left_context, right_context, num_frames, + num_frames_overlap, frame_subsampling_factor, + cut_zero_frames, &num_frames_written, &num_egs_written, + &example_writer)) + num_done++; + else + num_err++; + } + } + + KALDI_LOG << "Finished generating nnet3-chain examples, " + << "successfully processed " << num_done + << " feature files, wrote " << num_egs_written << " examples, " + << " with " << num_frames_written << " frames in total; " + << num_err << " files had errors."; + return (num_egs_written == 0 || num_err > num_done ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc new file mode 100644 index 00000000000..45dca4051f3 --- /dev/null +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -0,0 +1,101 @@ +// chainbin/nnet3-chain-merge-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "This copies nnet3+chain training examples from input to output, merging them\n" + "into composite examples. The --minibatch-size option controls how many egs\n" + "are merged into a single output eg.\n" + "\n" + "Usage: nnet3-chain-merge-egs [options] \n" + "e.g.\n" + "nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... 
\n" + "See also nnet3-chain-copy-egs\n"; + + bool compress = false; + int32 minibatch_size = 64; + + ParseOptions po(usage); + po.Register("minibatch-size", &minibatch_size, "Target size of minibatches " + "when merging (see also --measure-output-frames)"); + po.Register("compress", &compress, "If true, compress the output examples " + "(not recommended unless you are writing to disk"); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1), + examples_wspecifier = po.GetArg(2); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + std::vector examples; + examples.reserve(minibatch_size); + + int64 num_read = 0, num_written = 0; + while (!example_reader.Done()) { + const NnetChainExample &cur_eg = example_reader.Value(); + examples.resize(examples.size() + 1); + examples.back() = cur_eg; + + bool minibatch_ready = + static_cast(examples.size()) >= minibatch_size; + + // Do Next() now, so we can test example_reader.Done() below . + example_reader.Next(); + num_read++; + + if (minibatch_ready || (example_reader.Done() && !examples.empty())) { + NnetChainExample merged_eg; + MergeChainExamples(compress, &examples, &merged_eg); + std::ostringstream ostr; + ostr << "merged-" << num_written; + num_written++; + std::string output_key = ostr.str(); + example_writer.Write(output_key, merged_eg); + examples.clear(); + } + } + KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-normalize-egs.cc b/src/chainbin/nnet3-chain-normalize-egs.cc new file mode 100644 index 00000000000..9d3f56f756a --- /dev/null +++ b/src/chainbin/nnet3-chain-normalize-egs.cc @@ -0,0 +1,91 @@ +// chainbin/nnet3-chain-normalize-egs.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
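+
+// Note: this stage is needed only when nnet3-chain-get-egs was run without a
+// normalization FST; otherwise the normalization weights have already been
+// composed into the supervision FSTs at egs-creation time.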
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-chain-example.h" +#include "chain/chain-supervision.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Add weights from 'normalization' FST to nnet3+chain examples.\n" + "Should be done if and only if the argument of\n" + "nnet3-chain-get-egs was not supplied when the original egs were\n" + "created.\n" + "\n" + "Usage: nnet3-chain-normalize-egs [options] \n" + "\n" + "e.g.\n" + "nnet3-chain-normalize-egs dir/normalization.fst ark:train_in.cegs ark:train_out.cegs\n"; + + ParseOptions po(usage); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + std::string normalization_fst_rxfilename = po.GetArg(1), + examples_rspecifier = po.GetArg(2), + examples_wspecifier = po.GetArg(3); + + fst::StdVectorFst normalization_fst; + ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + NnetChainExampleWriter example_writer(examples_wspecifier); + + int64 num_written = 0, num_err = 0;; + for (; !example_reader.Done(); example_reader.Next()) { + std::string key = example_reader.Key(); + NnetChainExample eg = example_reader.Value(); + + if (eg.outputs.size() != 1) + KALDI_ERR << "Expected example to have exactly one output."; + if (!AddWeightToSupervisionFst(normalization_fst, + &(eg.outputs[0].supervision))) { + KALDI_WARN << "For example " << key + << ", FST was empty after composing with normalization FST. " + << "This should be extremely rare (a few per corpus, at most)"; + num_err++; + } else { + example_writer.Write(key, eg); + num_written++; + } + } + + KALDI_LOG << "Added normalization to " << num_written + << " egs; had errors on " << num_err; + return (num_written == 0 ? 1 : 0); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + diff --git a/src/chainbin/nnet3-chain-shuffle-egs.cc b/src/chainbin/nnet3-chain-shuffle-egs.cc new file mode 100644 index 00000000000..7ab6e28f607 --- /dev/null +++ b/src/chainbin/nnet3-chain-shuffle-egs.cc @@ -0,0 +1,115 @@ +// chainbin/nnet3-chain-shuffle-egs.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
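+
+// Memory note: with --buffer-size=0 every example is read into memory and the
+// order is fully shuffled; with --buffer-size=N only N examples are held at a
+// time and each incoming example randomly evicts (and writes) one buffer slot,
+// which gives an approximate shuffle in bounded memory.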
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "hmm/transition-model.h"
+#include "nnet3/nnet-chain-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Copy nnet3+chain examples for neural network training, from the input to output,\n"
+        "while randomly shuffling the order. This program will keep all of the examples\n"
+        "in memory at once, unless you use the --buffer-size option.\n"
+        "\n"
+        "Usage:  nnet3-chain-shuffle-egs [options] <egs-rspecifier> <egs-wspecifier>\n"
+        "\n"
+        "nnet3-chain-shuffle-egs --srand=1 ark:train.egs ark:shuffled.egs\n";
+
+    int32 srand_seed = 0;
+    int32 buffer_size = 0;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("buffer-size", &buffer_size, "If >0, size of a buffer we use "
+                "to do limited-memory partial randomization. Otherwise, do "
+                "full randomization.");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    int64 num_done = 0;
+
+    std::vector<std::pair<std::string, NnetChainExample*> > egs;
+
+    SequentialNnetChainExampleReader example_reader(examples_rspecifier);
+    NnetChainExampleWriter example_writer(examples_wspecifier);
+    if (buffer_size == 0) {  // Do full randomization
+      // Putting in an extra level of indirection here to avoid excessive
+      // computation and memory demands when we have to resize the vector.
+
+      for (; !example_reader.Done(); example_reader.Next())
+        egs.push_back(std::pair<std::string, NnetChainExample*>(
+            example_reader.Key(),
+            new NnetChainExample(example_reader.Value())));
+
+      std::random_shuffle(egs.begin(), egs.end());
+    } else {
+      KALDI_ASSERT(buffer_size > 0);
+      egs.resize(buffer_size,
+                 std::pair<std::string, NnetChainExample*>("", NULL));
+      for (; !example_reader.Done(); example_reader.Next()) {
+        int32 index = RandInt(0, buffer_size - 1);
+        if (egs[index].second == NULL) {
+          egs[index] = std::pair<std::string, NnetChainExample*>(
+              example_reader.Key(),
+              new NnetChainExample(example_reader.Value()));
+        } else {
+          example_writer.Write(egs[index].first, *(egs[index].second));
+          egs[index].first = example_reader.Key();
+          *(egs[index].second) = example_reader.Value();
+          num_done++;
+        }
+      }
+    }
+    for (size_t i = 0; i < egs.size(); i++) {
+      if (egs[i].second != NULL) {
+        example_writer.Write(egs[i].first, *(egs[i].second));
+        delete egs[i].second;
+        num_done++;
+      }
+    }
+
+    KALDI_LOG << "Shuffled order of " << num_done
+              << " neural-network training examples "
+              << (buffer_size ? "using a buffer (partial randomization)" : "");
+
+    return (num_done == 0 ? 1 : 0);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/chainbin/nnet3-chain-subset-egs.cc b/src/chainbin/nnet3-chain-subset-egs.cc
new file mode 100644
index 00000000000..0206003ab13
--- /dev/null
+++ b/src/chainbin/nnet3-chain-subset-egs.cc
@@ -0,0 +1,101 @@
+// chainbin/nnet3-chain-subset-egs.cc
+
+// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey)
+//           2014      Vimal Manohar
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "nnet3/nnet-chain-example.h"
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace kaldi::nnet3;
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Creates a random subset of the input nnet3+chain examples, of a specified size.\n"
+        "Uses no more memory than the size of the subset.\n"
+        "\n"
+        "Usage:  nnet3-chain-subset-egs [options] <egs-rspecifier> <subset-egs-wspecifier>\n"
+        "\n"
+        "e.g.\n"
+        "nnet3-chain-get-egs [args] ark:- | nnet3-chain-subset-egs --n=1000 ark:- ark:subset.cegs\n";
+
+    int32 srand_seed = 0;
+    int32 n = 1000;
+    bool randomize_order = true;
+    ParseOptions po(usage);
+    po.Register("srand", &srand_seed, "Seed for random number generator ");
+    po.Register("n", &n, "Number of examples to output");
+    po.Register("randomize-order", &randomize_order, "If true, randomize the order "
+                "of the output");
+
+    po.Read(argc, argv);
+
+    srand(srand_seed);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string examples_rspecifier = po.GetArg(1),
+        examples_wspecifier = po.GetArg(2);
+
+    std::vector<std::pair<std::string, NnetChainExample> > egs;
+    egs.reserve(n);
+
+    SequentialNnetChainExampleReader example_reader(examples_rspecifier);
+
+    int64 num_read = 0;
+    for (; !example_reader.Done(); example_reader.Next()) {
+      num_read++;
+      if (num_read <= n) {
+        egs.resize(egs.size() + 1);
+        egs.back().first = example_reader.Key();
+        egs.back().second = example_reader.Value();
+      } else {
+        BaseFloat keep_prob = n / static_cast<BaseFloat>(num_read);
+        if (WithProb(keep_prob)) {  // With probability "keep_prob"
+          int32 index = RandInt(0, n-1);
+          egs[index].first = example_reader.Key();
+          egs[index].second = example_reader.Value();
+        }
+      }
+    }
+    if (randomize_order)
+      std::random_shuffle(egs.begin(), egs.end());
+
+    NnetChainExampleWriter writer(examples_wspecifier);
+    for (size_t i = 0; i < egs.size(); i++) {
+      writer.Write(egs[i].first, egs[i].second);
+    }
+
+    KALDI_LOG << "Selected a subset of " << egs.size() << " out of " << num_read
+              << " nnet3+chain training examples ";
+
+    return (num_read != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what() << '\n';
+    return -1;
+  }
+}
+
+
diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc
new file mode 100644
index 00000000000..5486a5f7fe9
--- /dev/null
+++ b/src/chainbin/nnet3-chain-train.cc
@@ -0,0 +1,99 @@
+// chainbin/nnet3-chain-train.cc
+
+// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-chain-training.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train nnet3+chain neural network parameters with backprop and stochastic\n" + "gradient descent. Minibatches are to be created by nnet3-chain-merge-egs in\n" + "the input pipeline. This training program is single-threaded (best to\n" + "use it with a GPU).\n" + "\n" + "Usage: nnet3-chain-train [options] \n" + "\n" + "nnet3-chain-train 1.raw den.fst 'ark:nnet3-merge-egs 1.cegs ark:-|' 2.raw\n"; + + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainTrainingOptions opts; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + + opts.Register(&po); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string nnet_rxfilename = po.GetArg(1), + den_fst_rxfilename = po.GetArg(2), + examples_rspecifier = po.GetArg(3), + nnet_wxfilename = po.GetArg(4); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + bool ok; + + { + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + + NnetChainTrainer trainer(opts, den_fst, &nnet); + + SequentialNnetChainExampleReader example_reader(examples_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Wrote raw model to " << nnet_wxfilename; + return (ok ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/configure b/src/configure index a758dbeb50b..d8675e0925a 100755 --- a/src/configure +++ b/src/configure @@ -13,7 +13,9 @@ # ./configure --shared ## shared libraries. # ./configure --mkl-root=/opt/intel/mkl # ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes -# ./configure --openblas-root=../tools/OpenBLAS/install # before doing +# ./configure --mkl-root=/opt/intel/mkl --threaded-math=yes --mkl-threading=tbb +# This is for MKL 11.3 -- which does not seem to provide Intel OMP libs +# ./configure --openblas-root=../tools/OpenBLAS/install # before doing # # this, cd to ../tools and type "make openblas". Note: # # this is not working correctly on all platforms, do "make test" # # and look out for segmentation faults. @@ -21,7 +23,7 @@ #This should be incremented after every significant change of the configure script #I.e. 
after each change that affects the kaldi.mk or the build system as whole -CONFIGURE_VERSION=2 +CONFIGURE_VERSION=3 function rel2abs { if [ ! -z "$1" ]; then @@ -39,7 +41,7 @@ function read_dirname { function is_set { local myvar=${1:-notset} - if [ "$myvar" == "notset" ]; then + if [ "$myvar" == "notset" ]; then return 1 else return 0 @@ -50,7 +52,7 @@ function is_set { ## First do some checks. These verify that all the things are ## here that should be here. -if [ "`basename $PWD`" != "src" ]; then +if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' exit 1 fi @@ -67,10 +69,10 @@ unset OPENBLASROOT unset MKLLIBDIR function usage { - echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] + echo 'Usage: ./configure [--static|--shared] [--threaded-atlas={yes|no}] [--atlas-root=ATLASROOT] [--fst-root=FSTROOT] [--openblas-root=OPENBLASROOOT] [--clapack-root=CLAPACKROOT] [--mkl-root=MKLROOT] [--mkl-libdir=MKLLIBDIR] - [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] - [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR]'; + [--omp-libdir=OMPDIR] [--static-fst={yes|no}] [--static-math={yes|no}] [--threaded-math={yes|no}] [--mathlib=ATLAS|MKL|CLAPACK|OPENBLAS] + [--use-cuda={yes|no}] [--cudatk-dir=CUDATKDIR][--mkl-threading=sequential|iomp|tbb|gomp]'; } threaded_atlas=false # By default, use the un-threaded version of ATLAS. @@ -79,68 +81,115 @@ static_math=false static_fst=false use_cuda=true dynamic_kaldi=false +mkl_threading=sequential cmd_line="$0 $@" # Save the command line to include in kaldi.mk while [ $# -gt 0 ]; do case "$1" in - --help) usage; exit 0 ;; - --version) echo $CONFIGURE_VERSION; exit 0 ;; + --help) + usage; exit 0 ;; + --version) + echo $CONFIGURE_VERSION; exit 0 ;; --static) - # FIXME depends on order of options first--static/--shared then --static-math -> it will override it - dynamic_kaldi=false; - static_math=true; - static_fst=true; - shift ;; + dynamic_kaldi=false; + static_math=true; + static_fst=true; + shift ;; --shared) - dynamic_kaldi=true; - static_math=false; - static_fst=false; - shift ;; + dynamic_kaldi=true; + static_math=false; + static_fst=false; + shift ;; --atlas-root=*) - ATLASROOT=`read_dirname $1`; shift ;; + ATLASROOT=`read_dirname $1`; + shift ;; --threaded-atlas=yes) - threaded_atlas=true; shift ;; + threaded_atlas=true; + shift ;; --threaded-atlas=no) - threaded_atlas=false; shift ;; + threaded_atlas=false; + shift ;; --threaded-math=yes) - threaded_atlas=true; threaded_math=true; shift ;; + threaded_atlas=true; + threaded_math=true; + mkl_threading=iomp + shift ;; --threaded-math=no) - threaded_atlas=false; threaded_math=false; shift ;; + threaded_atlas=false; + threaded_math=false; + mkl_threading=sequential + shift ;; --use-cuda=yes) - use_cuda=true; shift ;; + use_cuda=true; + shift ;; --use-cuda=no) - use_cuda=false; shift ;; + use_cuda=false; + shift ;; --static-math=yes) - static_math=true; shift ;; + static_math=true; + shift ;; --static-math=no) - static_math=false; shift ;; + static_math=false; + shift ;; --static-fst=yes) - static_fst=true; shift ;; + static_fst=true; + shift ;; --static-fst=no) - static_fst=false; shift ;; + static_fst=false; + shift ;; + --mkl-threading=sequential) + threaded_atlas=false; + threaded_math=false; + mkl_threading=sequential; + shift ;; + --mkl-threading=*) + mkl_threading=`expr "X$1" : '[^=]*=\(.*\)'`; + 
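+      # Any --mkl-threading value other than 'sequential' (i.e. iomp, gomp or
+      # tbb, matching the library tables below) implies a threaded MKL, so
+      # threaded math is switched on below.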
threaded_atlas=true; + threaded_math=true; + shift ;; --fst-root=*) - FSTROOT=`read_dirname $1`; shift ;; + FSTROOT=`read_dirname $1`; + shift ;; --clapack-root=*) - CLAPACKROOT=`read_dirname $1`; shift ;; + CLAPACKROOT=`read_dirname $1`; + shift ;; --openblas-root=*) - OPENBLASROOT=`read_dirname $1`; shift ;; + OPENBLASROOT=`read_dirname $1`; + shift ;; --mkl-root=*) - MKLROOT=`read_dirname $1`; shift ;; + MKLROOT=`read_dirname $1`; + shift ;; --mkl-libdir=*) - MKLLIBDIR=`read_dirname $1`; shift ;; + MKLLIBDIR=`read_dirname $1`; + shift ;; + --speex-root=*) + SPEEXROOT=`read_dirname $1`; + shift ;; + --speex-libdir=*) + SPEEXLIBDIR=`read_dirname $1`; + shift ;; + --speex-includedir=*) + SPEEXINCLUDEDIR=`read_dirname $1`; + shift ;; --omp-libdir=*) - OMPLIBDIR=`read_dirname $1`; shift ;; + OMPLIBDIR=`read_dirname $1`; + shift ;; --mathlib=*) - MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; shift ;; + MATHLIB=`expr "X$1" : '[^=]*=\(.*\)'`; + shift ;; --cudatk-dir=*) - CUDATKDIR=`read_dirname $1`; shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only + CUDATKDIR=`read_dirname $1`; + shift ;; #CUDA is used in src/cudamatrix and src/nnet{,bin} only *) echo "Unknown argument: $1, exiting"; usage; exit 1 ;; esac done - +# the idea here is that if you change the configuration options from using +# CUDA to not using it, or vice versa, we want to recompile all parts of the +# code that may use a GPU. Touching this file is a way to force this. +touch cudamatrix/cu-common.h 2>/dev/null function failure { echo "***configure failed: $* ***" >&2 @@ -178,6 +227,7 @@ function check_for_slow_expf { cd .. } + function exit_success { check_for_bad_gcc; check_for_slow_expf; @@ -186,6 +236,27 @@ function exit_success { } + +function check_library { + local libpath=$1 + local libname=$2 + local libext=$3 + local full_libname="$libpath/$libname.$libext" + ##echo "Testing $full_libname" >&2 + test -f "$full_libname" && return ; + return 1 +} + + + +#Check if at least one of these variables is set +#If yes, we want to switch to using the MKL +is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" +is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="MKL" +is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" +is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" + +#MKL functions function linux_configure_mkllibdir { local mklroot=$1 @@ -198,120 +269,156 @@ function linux_configure_mkllibdir { fi } +function linux_configure_mkl_includes { + test -d $1/include && echo "$1/include" && return; + test -d $2/../../include && echo "$2/../../include" && return; + failure "Could not find the MKL include directory" +} + + function linux_configure_mkl_libraries { local mkllibdir=$1 local static=$2 local threaded=$3 + local mplib=$4 + + declare -A mkl_libs + mkl_libs=( + [sequential]="mkl_intel_lp64 mkl_core mkl_sequential" + [gomp]="mkl_intel_lp64 mkl_core mkl_gnu_thread" + [iomp]="mkl_intel_lp64 mkl_core mkl_intel_thread " + [tbb]="mkl_intel_lp64 mkl_core mkl_tbb_thread " + ) + + if [ -z "${mkl_libs[$threaded]}" ]; then + echo >&2 "Unknown threading mode: $threaded" + return 1; + fi - #these lines were generated using the Intel Link Line Advisor 2.2 - local threaded_libs="mkl_intel_lp64 mkl_intel_thread mkl_core" - local sequential_libs="mkl_intel_lp64 mkl_sequential mkl_core" - - if ! 
$static && $threaded ; then - for file in $threaded_libs; do - local libfile=$mkllibdir/lib$file.so - check_exists $libfile - done - echo "-L$mkllibdir -Wl,-rpath=$mkllibdir -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm" - elif ! $static && ! $threaded ; then - for file in $sequential_libs; do + local linkline="" + if ! $static ; then + linkline="-L$mkllibdir -Wl,-rpath=$mkllibdir" + for file in ${mkl_libs[$threaded]}; do local libfile=$mkllibdir/lib$file.so check_exists $libfile - done - echo "-L$mkllibdir -Wl,-rpath=$mkllibdir -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5 -lpthread -lm" - elif $static && $threaded ; then - local linkline="" - test -f "$mkllibdir/libmkl_solver_lp64.a" && linkline="$linkline $mkllibdir/libmkl_solver_lp64.a" - linkline="$linkline -Wl,--start-group" - for file in $threaded_libs; do - local libfile=$mkllibdir/lib$file.a - check_exists $libfile - linkline="$linkline $libfile" - done - #linkline="$linkline -Wl,--end-group -liomp5 -lpthread -lm " - linkline="$linkline -Wl,--end-group " - echo $linkline - elif $static && ! $threaded ; then - local linkline="" - test -f "$mkllibdir/libmkl_solver_lp64_sequential.a" && linkline="$linkline $mkllibdir/libmkl_solver_lp64_sequential.a" + linkline+=" -l$file " + done + else + if [ $threaded == "sequential" ] ; then + test -f "$mkllibdir/libmkl_solver_lp64.a" && \ + linkline="$linkline $mkllibdir/libmkl_solver_lp64.a" + else + test -f "$mkllibdir/libmkl_solver_lp64_sequential.a" && \ + linkline="$linkline $mkllibdir/libmkl_solver_lp64_sequential.a" + fi linkline="$linkline -Wl,--start-group" - for file in $sequential_libs; do - local libfile=$mkllibdir/lib$file.a + for file in ${mkl_libs[$threaded]}; do + local libfile=$mkllibdir/lib${file}.a check_exists $libfile linkline="$linkline $libfile" - done - #linkline="$linkline -Wl,--end-group -liomp5 -lpthread -lm " - linkline="$linkline -Wl,--end-group " - echo $linkline - else - return 1; + done + linkline="$linkline -Wl,--end-group " fi + echo "$linkline" } - -function linux_configure_mkl_includes { - test -d $1/include && echo "$1/include" && return; - test -d $2/../../include && echo "$2/../../include" && return; - failure "Could not find the MKL include directory" -} - -function check_library { - local libpath=$1 - local libname=$2 - local libext=$3 - - local full_libname="$libpath/$libname.$libext" - ##echo "Testing $full_libname" >&2 - test -f "$full_libname" && return ; - return 1 +function linux_configure_mkl_extra { + local static=$1 + local threaded=$2 + + declare -A extra_libs + extra_libs=( + [sequential]="-ldl -lpthread -lm" + [gomp]="-lgomp -ldl -lpthread -lm" + [iomp]="-ldl -lpthread -lm" + [tbb]=" -ldl -lpthread -lm " + ) + echo "$linkline ${extra_libs[$threaded]}" } +function linux_configure_threadinglibdir { + local library=$1 + local mklroot=$2 + local mkllibdir=$3 + local libexts=$4 -function linux_configure_omplibdir { - local mklroot=$1 - local mkllibdir=$2 - local libexts=$3 - - ##First we try to use the library in the same directory ##where the mkl libraries reside ##Afterwards, just try some possibilities for different MKL layouts for libext in $libexts; do - echo "Testing $libext from [$libexts] " >&2 + check_library $mkllibdir "lib$library" $libext \ + && echo `readlink -f $mkllibdir` && return 0 - check_library $mkllibdir "libiomp5" $libext \ - && echo `readlink -f $mkllibdir` && return - local testdir=`(cd $mklroot; cd ..; cd lib/intel64;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo 
`readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; local testdir=`(cd $mklroot; cd ..; cd lib/em64t;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; - + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; + local testdir=`(cd $mkllibdir; cd ../../..; cd lib/intel64;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; local testdir=`(cd $mklroot; cd ../../..; cd lib/em64t;pwd)` - test -d $testdir && check_library $testdir "libiomp5" $libext && echo `readlink -f $testdir` && return; + test -d $testdir && check_library $testdir "lib$library" $libext && echo `readlink -f $testdir` && return 0; done #failure "Could not find the library iomp5, use the configure switch --omp-libdir" return 1 } -#Check if at least one of these variables is set -#If yes, we want to switch to using the MKL -is_set $MKLLIBDIR && echo "Force-configuring KALDI to use MKL" && export MATHLIB="MKL" -is_set $MKLROOT && echo "Force-configuring KALDI to use MKL"&& export MATHLIB="MKL" -is_set $CLAPACKROOT && echo "Force-configuring KALDI to use CLAPACK"&& export MATHLIB="CLAPACK" -is_set $OPENBLASROOT && echo "Force-configuring KALDI to use OPENBLAS"&& export MATHLIB="OPENBLAS" +function linux_configure_mkl_threading { + local mklroot=$1 + local mkllibdir=$2 + local static=$3 + local threading=$4 + + declare -A libs + libs=( + [sequential]="" + [gomp]="" + [iomp]="iomp5" + [tbb]="tbb" + ) + + echo >&2 "Configuring MKL threading as $threading" + library=${libs[$threading]} + if [ -z "$library" ]; then + return 0 + fi + + if ! is_set $OMPLIBDIR ; then + if $static ; then + OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "a"` + else + OMPLIBDIR=`linux_configure_threadinglibdir $library "$MKLROOT" "$MKLLIBDIR" "so"` + fi + fi + + check_library $OMPLIBDIR "lib$library" "a" || \ + check_library $OMPLIBDIR "lib$library" "so" || \ + failure "Could not find the $library library, have your tried the --omp-libdir switch?" + + OMP_LINK_LINE='' + # TODO(arnab): in the following conditional, the $static_math test is + # needed since the OpenMP library is assumed to be dynamic. + if [ "$OMPLIBDIR" != "$MKLLIBDIR" ] ; then + OMP_LINK_LINE="-L${OMPLIBDIR}" + fi + #if the libiomp5 library is dynamic, we add the rpath attribute + if ! $static_math ; then + OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-rpath=$OMPLIBDIR -l$library" + else + OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-Bstatic -l$library -Wl,-Bdynamic" + fi + echo "$OMP_LINK_LINE" +} ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system - if [ ! $CUDATKDIR ]; then + if [ ! 
-d "$CUDATKDIR" ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do if [ -f $base/bin/nvcc ]; then CUDATKDIR=$base @@ -319,7 +426,7 @@ function linux_configure_cuda { done fi - if [ $CUDATKDIR ]; then + if [ -d "$CUDATKDIR" ]; then if [ ! -f $CUDATKDIR/bin/nvcc ]; then failure "Cannnot find nvcc in CUDATKDIR=$CUDATKDIR" fi @@ -329,45 +436,71 @@ function linux_configure_cuda { echo CUDA = true >> kaldi.mk echo CUDATKDIR = $CUDATKDIR >> kaldi.mk + # Determine 'CUDA_ARCH', + CUDA_VERSION=$($CUDATKDIR/bin/nvcc -V | tr '.,' '_ ' | awk '/release/{sub(/.*release/,""); print $1;}') # MAJOR_MINOR, + if [ -z "$CUDA_VERSION" ] ; then + echo "Cannot figure out CUDA_VERSION from the nvcc output. Either your CUDA is too new or too old." + exit 1 + fi + + case $CUDA_VERSION in + 5_5) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35" ;; + 6_*) CUDA_ARCH="-gencode arch=compute_13,code=sm_13 -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50" ;; + 7_*) CUDA_ARCH="-gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_53,code=sm_53" ;; + *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1 ;; + esac + echo "CUDA_ARCH = $CUDA_ARCH" >> kaldi.mk + + # 64bit/32bit? if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi + else - echo "CUDA will not be used! If you have already installed cuda drivers and cuda toolkit, try using --cudatk-dir=... option. Note: this is only relevant for neural net experiments" + echo "CUDA will not be used! If you have already installed cuda drivers " + echo "and cuda toolkit, try using --cudatk-dir=... option. Note: this is" + echo "only relevant for neural net experiments" fi } function linux_configure_speex { #check whether the user has called tools/extras/install_speex.sh or not - SPEEXROOT=`pwd`/../tools/speex + [ ! -z "$SPEEXROOT" ] || SPEEXROOT=`pwd`/../tools/speex + [ ! -z "$SPEEXLIBDIR" ] || SPEEXLIBDIR="$SPEEXROOT"/lib + [ ! -z "$SPEEXINCLUDEDIR" ] || SPEEXINCLUDEDIR="$SPEEXROOT"/include static_speex=$1 if [ "foo"$static_speex == "foo" ]; then static_speex=false fi - + if $static_speex; then spx_type=a else spx_type=so fi - if [ ! -f "$SPEEXROOT/lib/libspeex.${spx_type}" ];then - echo "Static=[$static_speex] Speex library not found: You can still build Kaldi without Speex." + if [ ! 
-f "$SPEEXLIBDIR/libspeex.${spx_type}" ];then + echo "Info: configuring Kaldi not to link with Speex (don't worry, it's only needed if you" + echo "intend to use 'compress-uncompress-speex', which is very unlikely)" return fi - - if [ -f $SPEEXROOT/include/speex/speex.h ]; then + + if [ -f $SPEEXINCLUDEDIR/speex/speex.h ]; then echo >> kaldi.mk - echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXROOT}/include >> kaldi.mk - + echo CXXFLAGS += -DHAVE_SPEEX -I${SPEEXINCLUDEDIR} >> kaldi.mk + if $static_speex; then - echo LDLIBS += $SPEEXROOT/lib/libspeex.a + echo LDLIBS += $SPEEXLIBDIR/libspeex.a else - echo LDLIBS += -L${SPEEXROOT}/lib -lspeex >> kaldi.mk - echo LDFLAGS += -Wl,-rpath=${SPEEXROOT}/lib >> kaldi.mk + echo LDLIBS += -L${SPEEXLIBDIR} -lspeex >> kaldi.mk + echo LDFLAGS += -Wl,-rpath=${SPEEXLIBDIR} >> kaldi.mk fi - + echo "Successfully configured with Speex at $SPEEXROOT, (static=[$static_speex])" else echo "Speex will not be used. If you want to use it, run tools/extras/install_speex.sh first." @@ -391,15 +524,18 @@ function linux_atlas_failure { # function we use when we couldn't find fix_cxx_flag echo "** $* ***" echo "** ERROR **" - echo "**Configure cannot proceed automatically, but by editing kaldi.mk" - echo "** you may be able to proceed (replace [somewhere] with a directory);" - echo "** or install the ATLAS package on your machine (if you are system " - echo " administrator, you can do it easily by searching the atlas packages " - echo " with commands like 'apt-cache search libatlas' or 'yum search atlas'," - echo " and install the packages with commands 'apt-get install' or 'yum install') " - echo " e.g. 'apt-get install libatlas-dev libatlas-base-dev';" - echo "** or try going to ../tools and running install_atlas.sh, and running" - echo " this script (configure) again." + echo "** Configure cannot proceed automatically." + echo "** If you know that you have ATLAS installed somewhere on your machine, you" + echo "** may be able to proceed by replacing [somewhere] in kaldi.mk with a directory." + echo "** If you have sudo (root) access you could install the ATLAS package on your" + echo "** machine, e.g. 'sudo apt-get install libatlas-dev libatlas-base-dev' or" + echo "** 'sudo yum install atlas.x86_64' or 'sudo zypper install libatlas3-devel'," + echo "** or on cygwin, install atlas from the installer GUI; and then run ./configure" + echo "** again." + echo "**" + echo "** Otherwise (or if you prefer OpenBLAS for speed), you could go the OpenBLAS" + echo "** route: cd to ../tools, type 'extras/install_openblas.sh', cd back to here," + echo "** and type './configure --openblas-root=../tools/OpenBLAS/install'" exit 1; } @@ -426,11 +562,11 @@ function linux_check_static { function linux_configure_debian_ubuntu { m=$1 ATLASLIBS="/usr/lib$m/atlas-base/libatlas.so.3gf /usr/lib$m/atlas-base/libf77blas.so.3gf /usr/lib$m/atlas-base/libcblas.so.3gf /usr/lib$m/atlas-base/liblapack_atlas.so.3gf" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! 
nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk @@ -438,18 +574,18 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } function linux_configure_debian_ubuntu3 { ATLASLIBS="/usr/lib/libatlas.so.3 /usr/lib/libf77blas.so.3 /usr/lib/libcblas.so.3 /usr/lib/liblapack_atlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk @@ -457,29 +593,29 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } function linux_configure_debian7 { ATLASLIBS="/usr/lib/atlas-base/libatlas.so.3.0 /usr/lib/atlas-base/libf77blas.so.3.0 /usr/lib/atlas-base/libcblas.so.3 /usr/lib/atlas-base/liblapack_atlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done lapacklib=$(echo $ATLASLIBS | awk '{print $NF}') - if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then + if ! nm --dynamic $lapacklib | grep ATL_cgetrf >/dev/null; then exit 1; fi libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_debian7" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -487,18 +623,18 @@ function linux_configure_debian7 { function linux_configure_redhat { m=$1 # 64 or empty. ATLASLIBS="/usr/lib$m/atlas/libatlas.so.3 /usr/lib$m/atlas/libf77blas.so.3 /usr/lib$m/atlas/libcblas.so.3 /usr/lib$m/atlas/libclapack.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! -f $f ] && return 1; done libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -508,18 +644,18 @@ function linux_configure_redhat_fat { # See http://stackoverflow.com/questions/13439296/build-shared-libraries-in-atlas. m=$1 # 64 or empty. ATLASLIBS="/usr/lib$m/atlas/libsatlas.so.3 /usr/lib$m/atlas/libtatlas.so.3" - for f in $ATLASLIBS; do + for f in $ATLASLIBS; do [ ! 
-f $f ] && return 1; done libdir=$(dirname $(echo $ATLASLIBS | awk '{print $1}')) [ -z "$libdir" ] && echo "Error getting libdir in linux_configure_redhat_fat" && exit 1; echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS -Wl,-rpath=$libdir >> kaldi.mk - echo + echo cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -553,25 +689,25 @@ function linux_configure_static { fi fi done - if [ "$ATLASLIBS" == "" ]; then + if [ "$ATLASLIBS" == "" ]; then echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. return ; fi - + for x in lib${pt}cblas.a libatlas.a lib${pt}f77blas.a; do if [ ! -f $ATLASLIBDIR/$x ]; then echo "Configuring static ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR" return 1; fi ATLASLIBS="$ATLASLIBS $ATLASLIBDIR/$x" - done + done if $threaded_atlas; then ATLASLIBS="$ATLASLIBS"; fi echo ATLASINC = $ATLASROOT/include >> kaldi.mk echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -591,7 +727,7 @@ function linux_check_dynamic { return 0; fi done - echo "... no {libatlas,lib${pt}atlas}.so in $dir"; + # echo "... no {libatlas,lib${pt}atlas}.so in $dir"; return 1; } @@ -635,7 +771,7 @@ function linux_configure_dynamic { echo Could not find any libraries $ATLASLIBDIR/{liblapack,liblapack_atlas,libclapack} that seem to be an ATLAS CLAPACK library. return 1; fi - + for x in ${pt}cblas atlas ${pt}f77blas; do if [ ! -f $ATLASLIBDIR/lib$x.so ]; then echo "Configuring dynamic ATLAS libraries failed: Could not find library $x in directory $ATLASLIBDIR" @@ -650,7 +786,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -693,7 +829,7 @@ echo "CONFIGURE_VERSION := $CONFIGURE_VERSION" >> kaldi.mk echo "FSTROOT = $FSTROOT" >> kaldi.mk # Check installed OpenFst version and add C++11 flags if OpenFst >= 1.4 -OPENFST_VER=`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'` +OPENFST_VER="${OPENFST_VER:-`grep 'PACKAGE_VERSION' $FSTROOT/Makefile | sed -e 's:.*= ::'`}" echo "OPENFST_VER = $OPENFST_VER" >> kaldi.mk OPENFST_VER_NUM=`echo $OPENFST_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d"` if [ $OPENFST_VER_NUM -ge 10400 ]; then @@ -710,7 +846,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." 
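# Illustration only (hypothetical version string, not part of the patch): the
# `printf "%d%02d%02d"` packing in the OPENFST_VER_NUM line above turns "1.4.1"
# into 10401, so any OpenFst release from the 1.4 series onwards passes the
# `-ge 10400` test and gets the C++11 flags:
#   echo "1.4.1" | sed 's/\./ /g' | xargs printf "%d%02d%02d"   # prints 10401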
@@ -739,7 +875,10 @@ if [ "`uname`" == "Darwin" ]; then elif [ "$osx_ver" == "10.10" ]; then check_exists makefiles/darwin_10_10.mk cat makefiles/darwin_10_10.mk >> kaldi.mk - else + elif [ "$osx_ver" == "10.11" ]; then + check_exists makefiles/darwin_10_11.mk + cat makefiles/darwin_10_11.mk >> kaldi.mk + else failure "OS X version '$osx_ver' not supported" fi echo "Configuration succeeded for platform Darwin." @@ -780,7 +919,7 @@ if [ "`uname`" == "Linux" ]; then failure "Could not find required header files cblas.h or clapack.h in ATLAS dir '$ATLASROOT/include'" fi echo "Using ATLAS as the linear algebra library." - + # Finding out where the libraries are located: # First we look for the static libraries and then look for dynamic ones. # We're looking for four libraries, all in the same directory, named @@ -825,8 +964,8 @@ if [ "`uname`" == "Linux" ]; then failure "MKL on Linux only supported for Intel(R) 64 architecture (x86_64). See makefiles/linux_64_mkl.mk to manually configure for other platforms." fi - - if is_set "$MKLROOT" -a ! is_set "$MKLLIBDIR"; then + + if ( is_set "$MKLROOT" && ! is_set "$MKLLIBDIR" ); then echo -n "Configuring MKL library directory: " MKLLIBDIR=`linux_configure_mkllibdir $MKLROOT` if [ $? -ne 0 ]; then @@ -836,47 +975,38 @@ if [ "`uname`" == "Linux" ]; then fi fi - MKL_LINK_LINE=`linux_configure_mkl_libraries "$MKLLIBDIR" $static_math $threaded_math` || exit 1 + MKL_LINK_LINE=`linux_configure_mkl_libraries "$MKLLIBDIR" $static_math $mkl_threading` || exit 1 + echo "MKL configured with threading: $mkl_threading, libs: $MKL_LINK_LINE" MKL_COMPILE_LINE=`linux_configure_mkl_includes "$MKLROOT" "$MKLLIBDIR"` || exit 1 echo "MKL include directory configured as: $MKL_COMPILE_LINE" MKL_COMPILE_LINE=" -I${MKL_COMPILE_LINE} " - - if ! is_set $OMPLIBDIR ; then - if $static_math ; then - OMPLIBDIR=`linux_configure_omplibdir "$MKLROOT" "$MKLLIBDIR" "a"` - else - OMPLIBDIR=`linux_configure_omplibdir "$MKLROOT" "$MKLLIBDIR" "so"` - fi - fi - check_library $OMPLIBDIR "libiomp5" "a" || check_library $OMPLIBDIR "libiomp5" "so" \ - || failure "Could not find the iomp5 library, have your tried the --omp-libdir switch?" - echo "OMP library directory configured as: $OMPLIBDIR" - OMP_LINK_LINE='' - # TODO(arnab): in the following conditional, the $static_math test is - # needed since the OpenMP library is assumed to be dynamic. - if [ "$OMPLIBDIR" != "$MKLLIBDIR" ] ; then - OMP_LINK_LINE="-L${OMPLIBDIR}" - #if the libiomp5 library is dynamic, we add the rpath attribute - if ! $static_math ; then - OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-rpath=$OMPLIBDIR" - else - OMP_LINK_LINE="$OMP_LINK_LINE -Wl,-Bstatic -liomp5 -Wl,-Bdynamic" - fi + + THREADING_LINE=`linux_configure_mkl_threading $MKLROOT $MKLLIBDIR $static_math $mkl_threading` || exit 1 + EXTRA_LIBS=`linux_configure_mkl_extra $static_math $mkl_threading` || exit 1 + if [ ! -z "$THREADING_LINE" ] || [ ! -z "$EXTRA_LIBS" ]; then + echo "MKL threading libraries configured as $THREADING_LINE $EXTRA_LIBS" fi - + echo "Using Intel MKL as the linear algebra library." + ( + cd probe; rm -f mkl-test; + g++ mkl-test.cc -o mkl-test $MKL_COMPILE_LINE $MKL_LINK_LINE $THREADING_LINE $EXTRA_LIBS || exit 1 + test -f ./mkl-test || exit 1 + ./mkl-test || exit 1 + cd .. + ) || failure "Cannot validate the MKL switches" echo MKLROOT = $MKLROOT >> kaldi.mk - if [ ! -z $MKLLIBDIR ]; then + if [ ! 
-z $MKLLIBDIR ]; then echo MKLLIB = $MKLLIBDIR >> kaldi.mk fi check_exists makefiles/linux_x86_64_mkl.mk cat makefiles/linux_x86_64_mkl.mk >> kaldi.mk fix_cxx_flag - echo "MKLFLAGS = ${MKL_LINK_LINE} ${OMP_LINK_LINE} " >> kaldi.mk + echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -899,7 +1029,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -908,7 +1038,7 @@ if [ "`uname`" == "Linux" ]; then if [ -z "$OPENBLASROOT" ]; then failure "Must specify the location of OPENBLAS with --openblas-root option (and it must exist)" fi - if [ ! -f $OPENBLASROOT/lib/libopenblas.so ]; then + if [ ! -f $OPENBLASROOT/lib/libopenblas.so ]; then failure "Expected to find the file $OPENBLASROOT/lib/libopenblas.so" fi echo "Your math library seems to be OpenBLAS. Configuring appropriately." @@ -923,11 +1053,11 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; - else + else failure "Unsupported linear algebra library '$MATHLIB'" fi fi diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index 34b621b428f..1bfb087540a 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -1,68 +1,36 @@ - all: -OPENFST_CXXFLAGS = -OPENFST_LDLIBS = - - include ../kaldi.mk - LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test - + cu-sparse-matrix-test cu-device-test OBJFILES = cu-device.o cu-math.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-rand.o cu-block-matrix.o \ - cu-sparse-matrix.o + cu-sparse-matrix.o cu-allocator.o cu-array.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o cu-randkernels.o endif LIBNAME = kaldi-cudamatrix -all: $(LIBFILE) - +ADDLIBS = ../matrix/kaldi-matrix.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../base/kaldi-base.a +# Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) - #Default compute capability architectures we compile with - CUDA_ARCH=-gencode arch=compute_20,code=sm_20 - #Get the CUDA Toolkit version (remove decimal point char) - CUDA_VERSION=$(shell $(CUDATKDIR)/bin/nvcc -V | grep release | sed -e 's|.*release ||' -e 's|,.*||' -e 's|\.||') - #For toolkit 4.2 or newer, add the compute capability 3.0 - CUDA_VER_GT_4_2 := $(shell [ $(CUDA_VERSION) -ge 42 ] && echo true) - ifeq ($(CUDA_VER_GT_4_2), true) - CUDA_ARCH += -gencode arch=compute_30,code=sm_30 - endif - #For toolkit 5.0 or newer, add the compute capability 3.5 - CUDA_VER_GT_5_0 := $(shell [ $(CUDA_VERSION) -ge 50 ] && echo true) - ifeq ($(CUDA_VER_GT_5_0), true) - CUDA_ARCH += -gencode arch=compute_35,code=sm_35 - endif - #For 
toolkit 6.0 or newer, add the compute capability 5.0 - CUDA_VER_GT_6_0 := $(shell [ $(CUDA_VERSION) -ge 60 ] && echo true) - ifeq ($(CUDA_VER_GT_6_0), true) - CUDA_ARCH += -gencode arch=compute_50,code=sm_50 - endif - #For toolkit older than 6.5, add the compute capability 1.0 - CUDA_VER_GT_6_5 := $(shell [ $(CUDA_VERSION) -ge 65 ] && echo true) - ifneq ($(CUDA_VER_GT_6_5), true) - CUDA_ARCH += -gencode arch=compute_13,code=sm_13 \ - -gencode arch=compute_10,code=sm_10 + ifndef CUDA_ARCH + $(error CUDA_ARCH is undefined, run 'src/configure') endif endif - -#implicit rule for kernel compilation +# Implicit rule for kernel compilation, %.o : %.cu $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ - -ADDLIBS = ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../util/kaldi-util.a - include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc new file mode 100644 index 00000000000..eacfbdf3c8e --- /dev/null +++ b/src/cudamatrix/cu-allocator.cc @@ -0,0 +1,370 @@ +// cudamatrix/cu-allocator.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#if HAVE_CUDA == 1 + +#include +#include +#include + +#include +#include +#include +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrix.h" +#include "base/kaldi-error.h" +#include "base/kaldi-utils.h" +#include "util/common-utils.h" + +namespace kaldi { + + +void* CuMemoryAllocator::Malloc(size_t size) { + // For now just call MallocPitch and throw away the pitch, to avoid + // duplicating code here. Apparently the time difference is quite small. + size_t pitch; + return MallocPitch(size, 1, &pitch); +} + +// Returns max(0, floor(log_2(i))). Not tested independently. +static inline size_t IntegerLog2(size_t i) { + size_t ans = 0; + while (i > 256) { + i >>= 8; + ans += 8; + } + while (i > 16) { + i >>= 4; + ans += 4; + } + while (i > 1) { + i >>= 1; + ans++; + } + return ans; +} + +//inline +CuMemoryAllocator::MruCache& CuMemoryAllocator::GetCacheForSize( + size_t num_bytes) { + size_t bucket_index = IntegerLog2(num_bytes); + KALDI_ASSERT(num_bytes > 0 && bucket_index < caches_.size()); + return caches_[bucket_index]; +} + +//inline +void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes, + size_t num_rows, + size_t *pitch) { + num_system_allocations_++; + void *ans; + cudaError_t e; + for (int32 i = 0; i <= 2; i++) { + if (num_rows != 1) { + Timer tim; + e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); + tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed(); + } else { + Timer tim; + // we might save a little time this way. 
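+      // (cudaMallocPitch would pad each row up to an alignment boundary; with
+      // a single row there is nothing to pad, so a plain cudaMalloc is enough
+      // and the pitch is simply row_bytes, as set below.)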
+ e = cudaMalloc(&ans, row_bytes); + tot_time_taken_in_cuda_malloc_ += tim.Elapsed(); + *pitch = row_bytes; + } + if (e != cudaSuccess) { + PrintMemoryUsage(); + // On the first 2 out of the 3 iters, try freeing memory. + if (i <= 1) { + KALDI_WARN << "Allocation of " << row_bytes << " x " + << num_rows << " region failed: freeing some memory and " + << "trying again. "; + BaseFloat new_memory_factor = 1.1; + if (opts_.memory_factor > new_memory_factor) { + KALDI_LOG << "To avoid future problems like this, changing " + << "memory_factor from " << opts_.memory_factor << " to " + << new_memory_factor; + opts_.memory_factor = new_memory_factor; + } + size_t memory_cached = MemoryCached(), + memory_requested = row_bytes * num_rows, + memory_to_free = std::max(memory_cached / 2, + std::min(memory_cached, + memory_requested)); + FreeSomeCachedMemory(memory_to_free); + } else { + KALDI_ERR << "Cannot allocate the requested memory (" + << row_bytes << " x " << num_rows << " = " + << row_bytes * num_rows << " bytes)"; + } + cudaGetLastError(); // Clear the error state. + } else { + break; + } + } + return ans; +} + +void CuMemoryAllocator::PrintMemoryUsage() const { + KALDI_LOG << "Memory usage: " << cur_bytes_allocated_ + << " bytes currently allocated (max: " + << max_bytes_allocated_ << "); " << cur_bytes_used_ + << " currently in use by user (max: " << max_bytes_used_ << ")" + << "; " << num_system_allocations_ << '/' + << num_user_allocations_ << " calls to Malloc* resulted in " + << "CUDA calls."; + KALDI_LOG << "Time taken in cudaMallocPitch=" << tot_time_taken_in_cuda_malloc_pitch_ + << ", in cudaMalloc=" << tot_time_taken_in_cuda_malloc_ + << ", in cudaFree=" << tot_time_taken_in_cuda_free_ + << ", in this->MallocPitch()=" << tot_time_taken_in_malloc_pitch_; +} + +CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts): + opts_(opts), + caches_(40), + cur_bytes_allocated_(0), + max_bytes_allocated_(0), + cur_bytes_used_(0), + max_bytes_used_(0), + t_(1), + num_user_allocations_(0), + num_system_allocations_(0), + tot_time_taken_in_cuda_malloc_(0.0), + tot_time_taken_in_cuda_malloc_pitch_(0.0), + tot_time_taken_in_cuda_free_(0.0), + tot_time_taken_in_malloc_pitch_(0.0) { } + +void* CuMemoryAllocator::MallocPitch(size_t row_bytes, + size_t num_rows, + size_t *pitch) { + Timer tim; + t_++; + num_user_allocations_++; + size_t requested_bytes = row_bytes * num_rows; + if (cur_bytes_used_ + requested_bytes > max_bytes_used_) + max_bytes_used_ = cur_bytes_used_ + requested_bytes; + MruCache &cache = GetCacheForSize(requested_bytes); + MemoryRequest request(row_bytes, num_rows); + CachedMemoryElement output; + if (cache.Lookup(request, &output)) { + // we have cached memory with this value. + void *ans = output.pointer; + *pitch = output.pitch; + used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, output.pitch); + cur_bytes_used_ += requested_bytes; + tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + return ans; + } else { + // note: it's important that we already updated max_bytes_used_. 
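+    // (Example with the default memory_factor of 1.5: if the most memory the
+    // user has ever held at once is 4GB, we allow at most 6GB of
+    // allocated-plus-cached memory, and free cached blocks before going past
+    // that limit.)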
+ size_t next_bytes_allocated = cur_bytes_allocated_ + requested_bytes, + max_bytes_to_allocate = + static_cast(opts_.memory_factor * max_bytes_used_); + ssize_t bytes_overflow = next_bytes_allocated - max_bytes_to_allocate; + if (bytes_overflow > 0) { + // The amount we would have allocated, after fulfilling this request, + // would exceed our limits (we don't allow ourselves to allocate more than + // memory_factor times the maximum amount of memory the user ever owns + // during the lifetime of the program). So free some memory. + KALDI_ASSERT(bytes_overflow <= MemoryCached()); // sanity check. + FreeSomeCachedMemory(static_cast(bytes_overflow)); + KALDI_ASSERT(cur_bytes_allocated_ + requested_bytes <= + max_bytes_to_allocate); + } + void *ans = MallocPitchInternal(row_bytes, num_rows, pitch); + cur_bytes_allocated_ += requested_bytes; + if (cur_bytes_allocated_ > max_bytes_allocated_) + max_bytes_allocated_ = cur_bytes_allocated_; + used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, *pitch); + cur_bytes_used_ += requested_bytes; + tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + return ans; + } +} + +void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) { + Timer tim; + // the next few lines are responsible for increasing the amount of memory we + // are going to free, in case the user requested an amount that's very tiny + // compared with the total amount of memory ever used. This helps us + // to amortize the cost of visiting all of the buckets inside this code. + // (there are only 40 buckets so it's not so big, but we're being careful. + size_t bytes_cached = cur_bytes_allocated_ - cur_bytes_used_, + min_to_free = static_cast(max_bytes_used_ * opts_.delete_factor); + size_t bytes_to_free = std::min(bytes_cached, + std::max(bytes_to_free_in, min_to_free)), + bytes_freed = 0; + + size_t num_caches = caches_.size(), + t = t_; + // size_factor contains the approximate (power-of-two) size of the pointers + // that each cache's pointers contain. The 'cost' of keeping any given pointer, + // we declare to be the time since we last used it multiplied by the size + // of the memory in the pointer. + std::vector size_factor(num_caches); + for (size_t i = 0, j=1; i < num_caches; i++, j *= 2) + size_factor[i] = j; + + std::priority_queue > queue; + // Set up the queue. + for (int32 i = 0; i < num_caches; i++) { + const MruCache &cache = caches_[i]; + size_t cache_t = cache.LeastRecentTime(); + if (cache_t > 0) { // t == 0 means the cache is empty. + size_t interval = t - cache_t; + BaseFloat cost = size_factor[i] * interval; + KALDI_ASSERT(interval > 0); + queue.push(std::pair(cost, i)); + } + } + while (bytes_freed < bytes_to_free) { + // If the following fails it means I made some kind of bookkeeping error, + // and most likely we are trying to free more memory than we really have + // cached. 
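+    // (Each pass of this loop takes the bucket whose oldest cached entry has
+    // the highest cost, i.e. approximate size times age, frees entries from
+    // its least-recently-used end until another bucket becomes more
+    // expensive, then re-queues it with its updated cost.)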
+ KALDI_ASSERT(!queue.empty() && "Code error."); + std::pair p = queue.top(); + int32 cache_index = p.second; + MruCache &cache = caches_[cache_index]; + queue.pop(); + if (queue.empty()) { + while (bytes_freed < bytes_to_free) { + bytes_freed += cache.RemoveLeastRecentlyUsed(); + } + } else { + BaseFloat next_worst_cost = queue.top().first; + while (1) { + bytes_freed += cache.RemoveLeastRecentlyUsed(); + if (bytes_freed >= bytes_to_free) + break; + size_t least_recent_time = cache.LeastRecentTime(); + if (least_recent_time == 0) // this cache is now empty + break; + size_t interval = t - least_recent_time; + KALDI_ASSERT(interval > 0); + BaseFloat cost = size_factor[cache_index] * interval; + if (cost < next_worst_cost) { + // There is another bucket that has worse cost than this, + // so stop processing this bucket-- but first put it + // back in the queue. + queue.push(std::pair(cost, cache_index)); + break; + } + } + } + } + KALDI_ASSERT(bytes_freed <= cur_bytes_allocated_); + cur_bytes_allocated_ -= bytes_freed; + tot_time_taken_in_cuda_free_ += tim.Elapsed(); +} + +void CuMemoryAllocator::Free(void *ptr) { + t_++; + unordered_map::iterator iter = + used_map_.find(ptr); + if (iter == used_map_.end()) { + KALDI_ERR << "Attempt to free CUDA memory pointer that was not allocated: " + << ptr; + } + const UsedMemoryElement &elem = iter->second; + size_t num_bytes = elem.row_bytes * elem.num_rows; + + cur_bytes_used_ -= num_bytes; + MruCache &cache = GetCacheForSize(num_bytes); + + cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows), + CachedMemoryElement(ptr, t_, elem.pitch)); + used_map_.erase(iter); +} + +size_t CuMemoryAllocator::MruCache::LeastRecentTime() const { + if (list_.empty()) { + KALDI_PARANOID_ASSERT(map_.empty()); + return 0; + } else { + const MemoryRequest &mr = list_.front(); + MapType::const_iterator iter = map_.find(mr); + KALDI_ASSERT(iter != map_.end()); + const MapValueType &queue = iter->second; + KALDI_ASSERT(!queue.empty()); + return queue.front().first.t; + } +} + +bool CuMemoryAllocator::MruCache::Lookup(const MemoryRequest &request, + CachedMemoryElement *output) { + MapType::iterator iter = map_.find(request); + if (iter == map_.end()) + return false; + MapValueType &q = iter->second; + KALDI_ASSERT(!q.empty()); + // use q.back() as we want to return the most recently used one if there + // is a choice. We believe this will give better caching behavior. + *output = q.back().first; + list_.erase(q.back().second); + q.pop_back(); + if (q.empty()) + map_.erase(request); + return true; +} + +void CuMemoryAllocator::MruCache::Insert(const MemoryRequest &request, + const CachedMemoryElement &element) { + list_.push_back(request); + map_[request].push_back(std::pair( + element, + --list_.end())); +} + +size_t CuMemoryAllocator::MruCache::RemoveLeastRecentlyUsed() { + // Remove least-recently-used element from cache. + KALDI_ASSERT(!list_.empty()); + MemoryRequest request = list_.front(); + MapType::iterator iter = map_.find(request); + KALDI_ASSERT(iter != map_.end()); + MapValueType &queue = iter->second; + KALDI_ASSERT(!queue.empty()); + // least recently used elements are at the front of the queue. 
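+  // (queue.front() is the oldest entry for this request, so its stored list
+  // iterator must be list_.begin(); the assert below checks this.)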
+ std::pair &p = queue.front(); + KALDI_ASSERT(p.second == list_.begin()); + CU_SAFE_CALL(cudaFree(p.first.pointer)); + queue.pop_front(); + if (queue.empty()) + map_.erase(request); + list_.pop_front(); + return request.first * request.second; +} + +CuMemoryAllocator::MruCache& CuMemoryAllocator::MruCache::operator = ( + const CuMemoryAllocator::MruCache &other) { + KALDI_ASSERT(other.list_.empty()); + return *this; +} +CuMemoryAllocator::MruCache::MruCache( + const CuMemoryAllocator::MruCache &other) { + KALDI_ASSERT(other.list_.empty()); +} + + + + +} + + +#endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h new file mode 100644 index 00000000000..b10601b8245 --- /dev/null +++ b/src/cudamatrix/cu-allocator.h @@ -0,0 +1,229 @@ +// cudamatrix/cu-allocator.h + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ +#define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ + +#if HAVE_CUDA == 1 + +#include +#include +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" +#include "util/stl-utils.h" + +namespace kaldi { + + +// For now we don't give the user a way to modify these from the command line. +struct CuAllocatorOptions { + // memory_factor is the total amount of (allocated + cached) memory that we + // allow to be held, relative to the max amount of memory the program has ever + // allocated. It will increase the amount of memory the program will + // potentially consume, by this factor. + BaseFloat memory_factor; + + // This is the minimum amount of memory that we will delete when we are forced + // to delete stuff, relative to the max amount of memory the program has ever + // allocated. This should be less than memory_factor - 1.0 and > 0. It + // shouldn't be too critical. The reason it exists is to avoid calling the + // cleanup code and only releasing very small amounts of memory, because there + // is a constant overhead proportional to the number of buckets. + BaseFloat delete_factor; + + CuAllocatorOptions(): memory_factor(1.5), + delete_factor(0.001) { } + + void Check() { + KALDI_ASSERT(delete_factor < memory_factor - 1.0 && delete_factor > 0.0); + } +}; + + + + +// Class that caches memory for us (the CUDA +// malloc and free routines are very slow). +// This is a member of the CuDevice class. +class CuMemoryAllocator { + public: + void* Malloc(size_t size); + + void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch); + + void Free(void *ptr); + + + // the maximum amount of memory that was ever allocated in the lifetime of the + // program, in bytes. + size_t MaxMemoryAllocated() const { return max_bytes_allocated_; } + + // memory held in the cache currently, in bytes. 
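+  // (i.e. bytes we have allocated from the device that no caller currently
+  // owns: cur_bytes_allocated_ minus cur_bytes_used_.)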
+ size_t MemoryCached() const { return cur_bytes_allocated_ - cur_bytes_used_; } + + // memory that's cached plus memory that's allocated, in bytes. + size_t MemoryAllocated() const { return cur_bytes_allocated_; } + + void PrintMemoryUsage() const; + + CuMemoryAllocator(CuAllocatorOptions opts); + private: + + void FreeSomeCachedMemory(size_t bytes_to_free); + + // This calls CudaMallocPitch, checks for errors (dies if it has to), and + // returns the result. It's up to the caller to do all the bookkeeping though. + inline void* MallocPitchInternal(size_t row_bytes, size_t num_rows, size_t *pitch); + + typedef std::pair MemoryRequest; // (row_bytes, num_rows). + struct CachedMemoryElement { + void *pointer; // the CUDA memory location that we own + size_t t; // time value when we put this in the cache. + size_t pitch; // pitch of this memory region (c.f. cudaMallocPitch()). + CachedMemoryElement() { } + CachedMemoryElement(void *pointer, size_t t, size_t pitch): + pointer(pointer), t(t), pitch(pitch) { } + }; + + // This class caches a map from MemoryRequest to a list of CachedMemoryElements, + // and gives us access to the least-recently-used element for efficient. + // removal. + // We will have an instance of this class for each power-of-2 of size in + // bytes. This makes it easier to, when we need to delete something, find + // the item for which the (time-since-used * size-in-bytes) is approximately + // greatest. + class MruCache { + public: + size_t LeastRecentTime() const; // t value of least recent CachedMemoryElement (0 + // if empty). + + size_t RemoveLeastRecentlyUsed(); // Remove least-recently-used element + // from cache. Return size in bytes of + // that removed memory region. Crash if + // this was empty. + + // Attempts lookup of the most recently cached element corresponding to + // 'request'. If available, removes it from the cache and puts it to + // 'output', and returns true. Otherwise returns false. + bool Lookup(const MemoryRequest &request, + CachedMemoryElement *output); + + // Inserts this CachedMemoryElement to the list of CachedMemoryElements for this + // MemoryRequest. The time in the CachedMemoryElement is expected to be greater + // than times in previously supplied CachedMemoryElements. + void Insert(const MemoryRequest &request, + const CachedMemoryElement &element); + + struct MemoryRequestHasher { + // input is interpreted as (row_bytes, num_rows). row_bytes will always + // be a multiple of 4, and num_rows will frequently be a multiple of + // powers of 2 also. We need to shift right and add so that there will be + // some action in the lower-order bits. + size_t operator () (const std::pair &p) const { + size_t temp = p.first + 1867 * p.second; + return temp + (temp >> 2) + (temp >> 8); + } + }; + + MruCache() { } + // Define these to make inclusion in std::vector possible, but make them + // fail if called on anything but empty cache objects-- we never resize + // the vector of caches after initializing it. + MruCache &operator = (const MruCache &other); + MruCache(const MruCache &other); + private: + typedef std::list ListType; + typedef std::list::iterator ListIterType; + typedef std::deque > MapValueType; + typedef unordered_map MapType; + // 'list_' contains MemoryRequests with the most recent on the back (where they are added), + // and least recent on the front (where they are removed by RemoveLeastRecentlyUsed, although + // they are also removed from random parts of the list by Lookup(). 
+ // There will in general be duplicates of MemoryRequests in the list, as + // many as there are entries in the MapValueType. + ListType list_; + // 'map_' maps from a MemoryRequest to a queue of (memory-element, + // iterator), with the most-recently-added things at the back; we remove + // things from the front of these queues (oldest) inside + // RemoveLeastRecentlyUsed(), and from the back (newest) in Lookup. + MapType map_; + }; + + + inline MruCache &GetCacheForSize(size_t num_bytes); + + CuAllocatorOptions opts_; + + // indexed by log_2 (amount of memory requested), the caches. + std::vector caches_; + + size_t cur_bytes_allocated_; // number of bytes currently owned by callers or + // cached. + size_t max_bytes_allocated_; // the max over all time, of cur_bytes_allocated_. + size_t cur_bytes_used_; // number of bytes currently owned by callers. + size_t max_bytes_used_; // the max over all time, of cur_bytes_used_. + size_t t_; // time counter, incremented with each call. + size_t num_user_allocations_; // number of times user calls Malloc* + size_t num_system_allocations_; // number of times we call cudaMalloc*. + double tot_time_taken_in_cuda_malloc_; // time in cudaMalloc + double tot_time_taken_in_cuda_malloc_pitch_; // time in cudaMallocPitch + double tot_time_taken_in_cuda_free_; // time in cudaFree + double tot_time_taken_in_malloc_pitch_; // time in this->MallocPitch() + + + // a memory element is 'used' when it is currently possessed by the caller + // (and is not in our cache). + struct UsedMemoryElement { + size_t row_bytes; + size_t num_rows; + size_t pitch; + UsedMemoryElement() { } + UsedMemoryElement(size_t row_bytes, size_t num_rows, size_t pitch): + row_bytes(row_bytes), num_rows(num_rows), pitch(pitch) { } + }; + + struct PointerHasher { + size_t operator() (const void *arg) const { + // the last few bits tend to be very predictable, for alignment reasons (CUDA + // allocation may align on 256 byte or 512 byte boundaries or something similar). + size_t temp = reinterpret_cast(arg); + return (temp >> 4) + (temp >> 9); + } + }; + + // This is a map from memory locations owned by the user, so we can recover + // the information when people call Free() and we add it back into the cache. 
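+  // (Keyed by the device pointer handed to the caller; the UsedMemoryElement
+  // stores the row_bytes, num_rows and pitch needed to put the region back
+  // into the right cache when Free() is called.)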
+ unordered_map used_map_; + +}; + + +} // namespace + +#endif // HAVE_CUDA + + +#endif diff --git a/src/cudamatrix/cu-array-inl.h b/src/cudamatrix/cu-array-inl.h index d9e88af36c5..6b9c91be642 100644 --- a/src/cudamatrix/cu-array-inl.h +++ b/src/cudamatrix/cu-array-inl.h @@ -1,6 +1,6 @@ // cudamatrix/cu-array-inl.h -// Copyright 2009-2012 Karel Vesely +// Copyright 2009-2016 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors @@ -23,6 +23,8 @@ #ifndef KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ #define KALDI_CUDAMATRIX_CU_ARRAY_INL_H_ +#include + #if HAVE_CUDA == 1 #include #include "cudamatrix/cu-common.h" @@ -109,6 +111,23 @@ void CuArray::CopyFromVec(const std::vector &src) { } +template +void CuArray::CopyFromArray(const CuArray &src) { + this->Resize(src.Dim(), kUndefined); + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), + cudaMemcpyDeviceToDevice)); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + memcpy(this->data_, src.data_, dim_ * sizeof(T)); + } +} + template void CuArray::CopyToVec(std::vector *dst) const { @@ -119,16 +138,33 @@ void CuArray::CopyToVec(std::vector *dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_*sizeof(T), cudaMemcpyDeviceToHost)); + CU_SAFE_CALL(cudaMemcpy(&dst->front(), Data(), dim_ * sizeof(T), cudaMemcpyDeviceToHost)); CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed()); } else #endif { - memcpy(&dst->front(), data_, dim_*sizeof(T)); + memcpy(&dst->front(), data_, dim_ * sizeof(T)); } } +template +void CuArray::CopyToHost(T *dst) const { + if (dim_ == 0) return; + KALDI_ASSERT(dst != NULL); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cudaMemcpy(dst, Data(), dim_ * sizeof(T), cudaMemcpyDeviceToHost)); + CuDevice::Instantiate().AccuProfile("CuArray::CopyToVecD2H", tim.Elapsed()); + } else +#endif + { + memcpy(dst, data_, dim_ * sizeof(T)); + } +} + + template void CuArray::SetZero() { if (dim_ == 0) return; @@ -145,70 +181,89 @@ void CuArray::SetZero() { } - -/** - * Print the vector to stream - */ -template -std::ostream &operator << (std::ostream &out, const CuArray &vec) { - std::vector tmp; - vec.CopyToVec(&tmp); - out << "["; - for(int32 i=0; i +void CuArray::Set(const T &value) { + // This is not implemented yet, we'll do so if it's needed. + KALDI_ERR << "CuArray::Set not implemented yet for this type."; } +// int32 specialization implemented in 'cudamatrix/cu-array.cc', +template<> +void CuArray::Set(const int32 &value); template -inline void CuArray::Set(const T &value) { +void CuArray::Add(const T &value) { // This is not implemented yet, we'll do so if it's needed. 
- KALDI_ERR << "CuArray::Set not implemented yet for this type."; + KALDI_ERR << "CuArray::Add not implemented yet for this type."; } - +// int32 specialization implemented in 'cudamatrix/cu-array.cc', template<> -inline void CuArray::Set(const int32 &value) { - if (dim_ == 0) return; -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; +void CuArray::Add(const int32 &value); - dim3 dimBlock(CU2DBLOCK); - dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); - ::MatrixDim d = { 1, Dim(), Dim() }; - - cudaI32_set_const(dimGrid, dimBlock, data_, value, d); - CU_SAFE_CALL(cudaGetLastError()); +template +inline T CuArray::Min() const { + KALDI_ASSERT(this->Dim() > 0); + Timer tim; + std::vector tmp(Dim()); + CopyToVec(&tmp); + T ans = *std::min_element(tmp.begin(), tmp.end()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - for (int32 i = 0; i < dim_; i++) - data_[i] = value; } +#endif + return ans; } -template -void CuArray::CopyFromArray(const CuArray &src) { - this->Resize(src.Dim(), kUndefined); - if (dim_ == 0) return; + +template +inline T CuArray::Max() const { + KALDI_ASSERT(this->Dim() > 0); + Timer tim; + std::vector tmp(Dim()); + CopyToVec(&tmp); + T ans = *std::max_element(tmp.begin(), tmp.end()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - CU_SAFE_CALL(cudaMemcpy(this->data_, src.data_, dim_ * sizeof(T), - cudaMemcpyDeviceToDevice)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - memcpy(this->data_, src.data_, dim_ * sizeof(T)); } +#endif + return ans; } +template +void CuArray::Read(std::istream& in, bool binary) { + std::vector tmp; + ReadIntegerVector(in, binary, &tmp); + (*this) = tmp; +} + + +template +void CuArray::Write(std::ostream& out, bool binary) const { + std::vector tmp(this->Dim()); + this->CopyToVec(&tmp); + WriteIntegerVector(out, binary, tmp); +} + + +/** + * Print the vector to stream + */ +template +std::ostream &operator << (std::ostream &out, const CuArray &vec) { + std::vector tmp; + vec.CopyToVec(&tmp); + out << "["; + for(int32 i=0; i cu_vec(vec); std::vector vec2; cu_vec.CopyToVec(&vec2); + T *vec22 = new T[vec.size()]; + cu_vec.CopyToHost(vec22); + delete[] vec22; } { // test assignment operator from CuArray. diff --git a/src/cudamatrix/cu-array.cc b/src/cudamatrix/cu-array.cc new file mode 100644 index 00000000000..86313f41292 --- /dev/null +++ b/src/cudamatrix/cu-array.cc @@ -0,0 +1,86 @@ +// cudamatrix/cu-array.cc + +// Copyright 2016 Brno University of Technology (author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if HAVE_CUDA == 1 +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-matrixdim.h" +#include "cudamatrix/cu-kernels.h" + +#include "cudamatrix/cu-array.h" + +namespace kaldi { + +template<> +void CuArray::Set(const int32 &value) { + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + + dim3 dimBlock(CU2DBLOCK); + dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cuda_int32_set_const(dimGrid, dimBlock, data_, value, d); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < dim_; i++) { + data_[i] = value; + } + } +} + + +template<> +void CuArray::Add(const int32 &value) { + if (dim_ == 0) return; +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + + dim3 dimBlock(CU2DBLOCK); + dim3 dimGrid(n_blocks(Dim(), CU2DBLOCK)); + ::MatrixDim d = { 1, Dim(), Dim() }; + + cuda_int32_add(dimGrid, dimBlock, data_, value, d); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < dim_; i++) { + data_[i] += value; + } + } +} + + +} // namespace kaldi diff --git a/src/cudamatrix/cu-array.h b/src/cudamatrix/cu-array.h index 18ea7c2ef11..86672db9b08 100644 --- a/src/cudamatrix/cu-array.h +++ b/src/cudamatrix/cu-array.h @@ -88,6 +88,11 @@ class CuArray { /// objects are more than plain structs. void CopyToVec(std::vector *dst) const; + /// Version of the above function that copies contents to a host array. + /// This function requires *dst to be allocated before calling. The allocated + /// size should be dim_ * sizeof(T) + void CopyToHost(T *dst) const; + /// Sets the memory for the object to zero, via memset. You should verify /// that this makes sense for type T. void SetZero(); @@ -96,6 +101,18 @@ class CuArray { /// assignment operators or destructors are not called. This is NOT IMPLEMENTED /// YET except for T == int32 (the current implementation will just crash). void Set(const T &value); + + /// Add a constant value. This is NOT IMPLEMENTED YET except for T == int32 + /// (the current implementation will just crash). + void Add(const T &value); + + /// Get minimum value (for now implemented on CPU, reimplement if slow). + /// Asserts the vector is non-empty, otherwise crashes. + T Min() const; + + /// Get minimum value (for now implemented on CPU, reimplement if slow). + /// Asserts the vector is non-empty, otherwise crashes. 
+ T Max() const; CuArray &operator= (const CuArray &in) { this->CopyFromArray(in); return *this; @@ -104,6 +121,10 @@ class CuArray { CuArray &operator= (const std::vector &in) { this->CopyFromVec(in); return *this; } + + /// I/O + void Read(std::istream &is, bool binary); + void Write(std::ostream &is, bool binary) const; private: MatrixIndexT dim_; ///< dimension of the vector @@ -115,9 +136,8 @@ class CuArray { /// I/O template std::ostream &operator << (std::ostream &out, const CuArray &vec); - -} // namespace +} // namespace #include "cudamatrix/cu-array-inl.h" diff --git a/src/cudamatrix/cu-block-matrix.cc b/src/cudamatrix/cu-block-matrix.cc index 018a1a2a672..d36b3e31f92 100644 --- a/src/cudamatrix/cu-block-matrix.cc +++ b/src/cudamatrix/cu-block-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include diff --git a/src/cudamatrix/cu-common.cc b/src/cudamatrix/cu-common.cc index cbe6392dbf6..2b23bf0b621 100644 --- a/src/cudamatrix/cu-common.cc +++ b/src/cudamatrix/cu-common.cc @@ -1,6 +1,7 @@ // cudamatrix/cu-common.cc // Copyright 2013 Karel Vesely +// 2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,11 +24,11 @@ // This file contains some #includes, forward declarations // and typedefs that are needed by all the main header // files in this directory. - #include "base/kaldi-common.h" #include "matrix/kaldi-blas.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-matrixdim.h" namespace kaldi { @@ -43,6 +44,29 @@ cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans) { cublas_trans = CUBLAS_OP_C; return cublas_trans; } + +void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, + int32 num_cols, + dim3 *dimGrid, + dim3 *dimBlock) { + KALDI_ASSERT(num_rows > 0 && num_cols > 0); + int32 col_blocksize = 64, row_blocksize = 4; + while (col_blocksize > 1 && + (num_cols + (num_cols / 2) <= col_blocksize || + num_rows > 65536 * row_blocksize)) { + col_blocksize /= 2; + row_blocksize *= 2; + } + + dimBlock->x = col_blocksize; + dimBlock->y = row_blocksize; + dimBlock->z = 1; + dimGrid->x = n_blocks(num_cols, col_blocksize); + dimGrid->y = n_blocks(num_rows, row_blocksize); + KALDI_ASSERT(dimGrid->y <= 65536 && + "Matrix has too many rows to process"); + dimGrid->z = 1; +} #endif } // namespace diff --git a/src/cudamatrix/cu-common.h b/src/cudamatrix/cu-common.h index 7530d5c8627..eadf963e2c8 100644 --- a/src/cudamatrix/cu-common.h +++ b/src/cudamatrix/cu-common.h @@ -30,7 +30,7 @@ #include "matrix/matrix-common.h" #if HAVE_CUDA == 1 -#include +#include #include @@ -41,26 +41,42 @@ if ((ret = (fun)) != 0) { \ KALDI_ERR << "cudaError_t " << ret << " : \"" << cudaGetErrorString((cudaError_t)ret) << "\" returned from '" << #fun << "'"; \ } \ - cudaThreadSynchronize(); \ -} + cudaDeviceSynchronize(); \ +} #define KALDI_CUDA_ERR(ret, msg) \ { \ if (ret != 0) { \ KALDI_ERR << msg << ", diagnostics: cudaError_t " << ret << " : \"" << cudaGetErrorString((cudaError_t)ret) << "\", in " << __FILE__ << ":" << __LINE__; \ } \ - cudaThreadSynchronize(); \ -} + cudaDeviceSynchronize(); \ +} namespace kaldi { /** Number of blocks in which the task of size 'size' is splitted **/ -inline int32 n_blocks(int32 size, int32 block_size) { - return size / block_size + ((size % block_size == 0)? 0 : 1); +inline int32 n_blocks(int32 size, int32 block_size) { + return size / block_size + ((size % block_size == 0)? 
0 : 1); } cublasOperation_t KaldiTransToCuTrans(MatrixTransposeType kaldi_trans); - + + +/* + This function gives you suitable dimBlock and dimGrid sizes for a simple + matrix operation (one that applies to each element of the matrix. The x + indexes will be interpreted as column indexes, and the y indexes will be + interpreted as row indexes; this is based on our interpretation of a matrix as + being row-major, i.e. having column-stride = 1, not based on CuBLAS's + opposite interpretation. There is a good reason for associating the column + index with x and not y; this helps memory locality in adjacent kernels. + */ +void GetBlockSizesForSimpleMatrixOperation(int32 num_rows, + int32 num_cols, + dim3 *dimGrid, + dim3 *dimBlock); + + } #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-device-test.cc b/src/cudamatrix/cu-device-test.cc new file mode 100644 index 00000000000..716c1c24d4c --- /dev/null +++ b/src/cudamatrix/cu-device-test.cc @@ -0,0 +1,125 @@ +// cudamatrix/cu-device-test.cc + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix.h" +#include "cudamatrix/cu-vector.h" + +using namespace kaldi; + + +namespace kaldi { + + +template +std::string NameOf() { + return (sizeof(Real) == 8 ? 
"" : ""); +} + +template void TestCuMatrixResize(int32 size_multiple) { + int32 num_matrices = 256; + BaseFloat time_in_secs = 0.2; + + std::vector > sizes(num_matrices); + + for (int32 i = 0; i < num_matrices; i++) { + int32 num_rows = RandInt(1, 10); + num_rows *= num_rows; + num_rows *= size_multiple; + int32 num_cols = RandInt(1, 10); + num_cols *= num_cols; + num_cols *= size_multiple; + sizes[i].first = num_rows; + sizes[i].second = num_rows; + } + + std::vector > matrices(num_matrices); + + Timer tim; + size_t num_floats_processed = 0; + for (;tim.Elapsed() < time_in_secs; ) { + int32 matrix = RandInt(0, num_matrices - 1); + if (matrices[matrix].NumRows() == 0) { + int32 num_rows = sizes[matrix].first, + num_cols = sizes[matrix].second; + matrices[matrix].Resize(num_rows, num_cols, kUndefined); + num_floats_processed += num_rows * num_cols; + } else { + matrices[matrix].Resize(0, 0); + } + } + + BaseFloat gflops = num_floats_processed / (tim.Elapsed() * 1.0e+09); + + KALDI_LOG << "For CuMatrix::Resize" << NameOf() << ", for size_multiple = " + << size_multiple << ", speed was " << gflops << " gigaflops."; +} + +template +void CudaMatrixResizeTest() { + std::vector sizes; + sizes.push_back(1); + sizes.push_back(2); + sizes.push_back(4); + sizes.push_back(8); + sizes.push_back(16); + //sizes.push_back(24); + //sizes.push_back(32); + //sizes.push_back(40); + + int32 ns = sizes.size(); + for (int32 s = 0; s < ns; s++) + TestCuMatrixResize(sizes[s]); +} + + +} // namespace kaldi + + +int main() { + for (int32 loop = 0; loop < 2; loop++) { +#if HAVE_CUDA == 1 + if (loop == 0) + CuDevice::Instantiate().SelectGpuId("no"); + else + CuDevice::Instantiate().SelectGpuId("yes"); +#endif + + kaldi::CudaMatrixResizeTest(); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().DoublePrecisionSupported()) { + kaldi::CudaMatrixResizeTest(); + } else { + KALDI_WARN << "Double precision not supported"; + } +#else + kaldi::CudaMatrixResizeTest(); +#endif + } +#if HAVE_CUDA == 1 + CuDevice::Instantiate().PrintProfile(); +#endif + std::cout << "Tests succeeded.\n"; +} diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 5246dfd2cb7..c34994ed6ce 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Lucas Ondel -// 2013 Johns Hopkins University (author: Daniel Povey) +// 2013-2015 Johns Hopkins University (author: Daniel Povey) // 2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -24,7 +24,7 @@ #if HAVE_CUDA == 1 -#include +#include #include #include @@ -41,10 +41,10 @@ #include "base/kaldi-error.h" #include "base/kaldi-utils.h" #include "util/common-utils.h" +#include "util/kaldi-io.h" namespace kaldi { - /** This function was added by Dan in July 2015 after upgrading on the CLSP cluster to the CUDA 7.0 toolkit; the old mechanism of just calling @@ -55,22 +55,26 @@ namespace kaldi { changed feature (the NVidia docs were never super-clear regarding device initialization). But regardless, changing to this new mechanism should be harmless even if the problem was specific to the CLSP grid. - */ +*/ -static bool GetCudaContext(int32 num_gpus) { - cudaError_t e; +static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { + std::ostringstream debug_stream; + debug_stream << "num-gpus=" << num_gpus << ". "; for (int32 device = 0; device < num_gpus; device++) { cudaSetDevice(device); - e = cudaDeviceSynchronize(); // << CUDA context gets created here. 
- cudaGetLastError(); // reset the error state + cudaError_t e = cudaDeviceSynchronize(); // << CUDA context gets created here. if (e == cudaSuccess) { + *debug_str = debug_stream.str(); return true; } + debug_stream << "Device " << device << ": " << cudaGetErrorString(e) << ". "; + cudaGetLastError(); // Make sure the error state doesn't get returned in + // the next cudaGetLastError(). } + *debug_str = debug_stream.str(); return false; } - /** * SelectGpuId(use_gpu) * @@ -102,7 +106,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { << ", cannot change it on the fly!"; } // Allow the GPU to stay disabled - if(!Enabled() && use_gpu == "no") { + if (!Enabled() && use_gpu == "no") { KALDI_LOG << "Manually selected to compute on CPU."; return; } @@ -110,8 +114,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { // Check that we have a gpu available int32 num_gpus = 0; - cudaError_t e; - e = cudaGetDeviceCount(&num_gpus); + cudaError_t e = cudaGetDeviceCount(&num_gpus); if (num_gpus == 0) { if (use_gpu == "yes" || use_gpu == "wait") { @@ -124,18 +127,24 @@ void CuDevice::SelectGpuId(std::string use_gpu) { } // Create a CUDA context. - bool got_context = GetCudaContext(num_gpus); + std::string debug_str; + bool got_context = GetCudaContext(num_gpus, &debug_str); if (use_gpu != "wait") { if (!got_context) { // So far no we don't have context, sleep a bit and retry. int32 sec_sleep = (use_gpu == "yes" ? 20 : 2); KALDI_WARN << "Will try again to get a GPU after " << sec_sleep - << " seconds."; + << " seconds."; Sleep(sec_sleep); - if (! GetCudaContext(num_gpus)) { + if (!GetCudaContext(num_gpus, &debug_str)) { if (use_gpu == "yes") { - KALDI_CUDA_ERR(e, "Failed to create CUDA context, no more unused GPUs?"); + { + Input input; + input.Open("nvidia-smi 1>&2 |"); + } + KALDI_LOG << debug_str; + KALDI_ERR << "Failed to create CUDA context, no more unused GPUs? "; } if (use_gpu == "optional") { KALDI_WARN << "Running on CPU!!! No more unused CUDA GPUs?"; @@ -154,7 +163,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { num_times++; wait_time += sec_sleep; Sleep(sec_sleep); - got_context = GetCudaContext(num_gpus); + got_context = GetCudaContext(num_gpus, &debug_str); } KALDI_WARN << "Waited " << wait_time @@ -170,7 +179,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { return; } else { // Or suggest to use compute exclusive mode - if(num_gpus > 1) { + if (num_gpus > 1) { KALDI_WARN << "Suggestion: use 'nvidia-smi -c 1' to set compute exclusive mode"; } // And select the GPU according to proportion of free memory @@ -199,15 +208,14 @@ void CuDevice::FinalizeActiveGpu() { // Get the device-id of active device: { int32 act_gpu_id; - cudaError_t e; - e = cudaGetDevice(&act_gpu_id); - if(e != cudaSuccess) { + cudaError_t e = cudaGetDevice(&act_gpu_id); + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device-id of active device."); } // Remember the id of active GPU active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on // Initialize the CUBLAS - CU_SAFE_CALL(cublasInit()); + CU_SAFE_CALL(cublasCreate(&handle_)); // Notify user which GPU is finally used char name[128]; @@ -218,8 +226,6 @@ void CuDevice::FinalizeActiveGpu() { KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t" << GetFreeMemory(&free_memory_at_startup_, NULL) << " version " << properties_.major << "." 
<< properties_.minor; - - if (verbose_) PrintMemoryUsage(); } return; } @@ -239,12 +245,12 @@ bool CuDevice::IsComputeExclusive() { // get the device-id and its device-properties int32 gpu_id = -1; cudaError_t e = cudaGetDevice(&gpu_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get current device"); } struct cudaDeviceProp gpu_prop; e = cudaGetDeviceProperties(&gpu_prop, gpu_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device properties"); } // find out whether compute exclusive mode is used @@ -263,7 +269,7 @@ bool CuDevice::IsComputeExclusive() { // The computation mode is not compute-exclusive, // in this case we release the GPU context... e = cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU"); } return false; @@ -277,14 +283,13 @@ bool greater_pair(const std::pair &left, const std::pair& right) bool CuDevice::SelectGpuIdAuto() { // Check that we have at least one gpu - cudaError_t e; int32 num_gpus = 0; - e = cudaGetDeviceCount(&num_gpus); - if(num_gpus == 0) { + cudaError_t e = cudaGetDeviceCount(&num_gpus); + if (num_gpus == 0) { KALDI_WARN << "No CUDA devices found"; if (e != cudaSuccess) { KALDI_WARN << "cudaGetDeviceCount() returned " << e - <<", meaning: \"" << cudaGetErrorString(e) << "\""; + <<", meaning: \"" << cudaGetErrorString(e) << "\""; } return false; } @@ -343,7 +348,7 @@ bool CuDevice::SelectGpuIdAuto() { // find GPU with max free memory int32 max_id=0; std::sort(free_mem_ratio.begin(), free_mem_ratio.end(), - greater_pair); + greater_pair); // the free_mem_ratio should be bigger than zero KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0); @@ -359,14 +364,14 @@ bool CuDevice::SelectGpuIdAuto() { KALDI_LOG << "Trying to select device: " << dev_id << " (automatically), mem_ratio: " << mem_ratio; e = cudaSetDevice(dev_id); - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e - << ", Error message: \"" << cudaGetErrorString(e) << "\""; + << ", Error message: \"" << cudaGetErrorString(e) << "\""; } else { e = cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize - if(e != cudaSuccess) { + if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e - << ", Error message: \"" << cudaGetErrorString(e) << "\""; + << ", Error message: \"" << cudaGetErrorString(e) << "\""; } } max_id++; @@ -390,9 +395,11 @@ void CuDevice::AccuProfile(const std::string &key, double time) { void CuDevice::PrintMemoryUsage() const { if (Enabled()) { + allocator_.PrintMemoryUsage(); int64 free_memory_now; GetFreeMemory(&free_memory_now, NULL); - KALDI_LOG << "Memory used: " << (free_memory_at_startup_ - free_memory_now) << " bytes."; + KALDI_LOG << "Memory used (according to the device): " + << (free_memory_at_startup_ - free_memory_now) << " bytes."; } } @@ -400,7 +407,7 @@ void CuDevice::PrintProfile() { if (verbose_ && Enabled()) { std::ostringstream os; os << "-----\n[cudevice profile]\n"; - std::map::iterator it; + unordered_map::iterator it; std::vector > pairs; double total_time = 0.0; for(it = profile_map_.begin(); it != profile_map_.end(); ++it) { @@ -425,10 +432,10 @@ void CuDevice::PrintProfile() { std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { -// WARNING! the CUDA API is inconsistent accross versions! + // WARNING! 
the CUDA API is inconsistent accross versions! #ifdef _MSC_VER - size_t mem_free, mem_total; - cuMemGetInfo_v2(&mem_free, &mem_total); + size_t mem_free, mem_total; + cuMemGetInfo_v2(&mem_free, &mem_total); #else #if (CUDA_VERSION >= 3020) // define the function signature type @@ -437,16 +444,12 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { unsigned int mem_free, mem_total; #endif { - // we will load the cuMemGetInfo dynamically from libcuda.so - // cuMemGetInfo(&mem_free, &mem_total); + // we will load cuMemGetInfo_v2 dynamically from libcuda.so // pre-fill ``safe'' values that will not cause problems mem_free = 1; mem_total = 1; -#ifdef _MSC_VER - cuMemGetInfo_v2(&mem_free, &mem_total); -#else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if(NULL == libcuda) { + if (NULL == libcuda) { KALDI_WARN << "cannot open libcuda.so"; } else { // define the function signature type @@ -458,7 +461,7 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { typedef CUresult (*cu_fun_ptr)(int*, int*); cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); #endif - if(NULL == dl_cuMemGetInfo) { + if (NULL == dl_cuMemGetInfo) { KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; } else { // call the function @@ -467,12 +470,11 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { // close the library dlclose(libcuda); } -#endif } #endif // copy the output values outside - if(NULL != free) *free = mem_free; - if(NULL != total) *total = mem_total; + if (NULL != free) *free = mem_free; + if (NULL != total) *total = mem_total; // prepare the text output std::ostringstream os; os << "free:" << mem_free/(1024*1024) << "M, " @@ -491,14 +493,14 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { #else // open libcuda.so void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if(NULL == libcuda) { + if (NULL == libcuda) { KALDI_WARN << "cannot open libcuda.so"; } else { // define the function signature type typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); // get the symbol cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); - if(NULL == cuDeviceGetName_ptr) { + if (NULL == cuDeviceGetName_ptr) { KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so"; } else { // call the function @@ -512,7 +514,7 @@ void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { void CuDevice::CheckGpuHealth() { - if(!Enabled()) return; + if (!Enabled()) return; Timer t; // prepare small matrices for a quick test Matrix a(50, 100); @@ -532,47 +534,48 @@ void CuDevice::CheckGpuHealth() { } -void CuDevice::Free(void *ptr) { +/* + void CuDevice::Free(void *ptr) { CU_SAFE_CALL(cudaFree(ptr)); -} + } -void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { - void *ret_ptr = NULL; - cudaError_t e = cudaMallocPitch(&ret_ptr, pitch, row_bytes, num_rows); + void* CuDevice::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { + void *ans = NULL; + cudaError_t e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); if (e != cudaSuccess) { - PrintMemoryUsage(); - KALDI_ERR << "CuDevice::MallocPitch: cannot allocate the requested memory (" - << row_bytes << " x " << num_rows << " = " - << row_bytes * num_rows << " bytes )"; + PrintMemoryUsage(); + KALDI_ERR << "CuDevice::MallocPitch: cannot allocate the requested memory (" + << row_bytes << " x " << num_rows << " = " + << row_bytes * num_rows << " bytes )"; + } + return ans; } - return ret_ptr; -} -void* 
CuDevice::Malloc(size_t size) { - void *ret_ptr = NULL; - cudaError_t e = cudaMalloc(&ret_ptr, size); + void* CuDevice::Malloc(size_t size) { + void *ans = NULL; + cudaError_t e = cudaMalloc(&ans, size); if (e != cudaSuccess) { - PrintMemoryUsage(); - KALDI_ERR << "CuDevice::Malloc: cannot allocate the requested memory" - << " (" << size << " bytes )"; + PrintMemoryUsage(); + KALDI_ERR << "CuDevice::Malloc: cannot allocate the requested memory" + << " (" << size << " bytes )"; } - return ret_ptr; -} + return ans; + } +*/ -CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true) - { } +CuDevice::CuDevice(): active_gpu_id_(-1), verbose_(true), + allocator_(CuAllocatorOptions()) { } CuDevice::~CuDevice() { if (Enabled()) { - cublasShutdown(); + cublasDestroy(handle_); + cudaDeviceReset(); } } // The instance of the static singleton CuDevice CuDevice::global_device_; - - } diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 5858fc2d84e..ddf275a73e8 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -1,6 +1,7 @@ // cudamatrix/cu-device.h // Copyright 2009-2012 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -24,18 +25,19 @@ #if HAVE_CUDA == 1 +#include #include #include #include #include #include #include "base/kaldi-common.h" +#include "cudamatrix/cu-allocator.h" namespace kaldi { - /** - * Singleton object which represents CUDA device + * Singleton object which represents the CUDA device * responsible for CUBLAS initilalisation, collects profiling info */ class CuDevice { @@ -44,26 +46,29 @@ class CuDevice { ~CuDevice(); static inline CuDevice& Instantiate() { return global_device_; } + inline cublasHandle_t GetHandle() { return handle_; } + // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc, // cudaMallocPitch and cudaFree. Their function is to cache the results of // previous allocations to avoid the very large overhead that CUDA's // allocation seems to give for some setups. - void* Malloc(size_t size); - - void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch); - - void Free(void *ptr); + inline void* Malloc(size_t size) { return allocator_.Malloc(size); } + + inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { + return allocator_.MallocPitch(row_bytes, num_rows, pitch); + } + inline void Free(void *ptr) { allocator_.Free(ptr); } /// Select a GPU for computation, the 'use_gpu' modes are: /// "yes" -- Select GPU automatically and die if this fails. - /// "optional" -- Do as above, but if it fails, back off to CPU. - /// "no" -- Run on CPU. + /// "optional" -- Do as above, but if it fails, back off to CPU. + /// "no" -- Run on CPU. 
/// (more comments in cu-device.cc) void SelectGpuId(std::string use_gpu); /// Check if the CUDA GPU is selected for use bool Enabled() const { - return (active_gpu_id_ > -1); + return (active_gpu_id_ > -1); } /// Get the active GPU id @@ -79,18 +84,18 @@ class CuDevice { /// Sum the IO time void AccuProfile(const std::string &key, double time); - void PrintProfile(); + void PrintProfile(); void PrintMemoryUsage() const; - - void ResetProfile() { - profile_map_.clear(); + + void ResetProfile() { + profile_map_.clear(); } - + /// Get the actual GPU memory use stats std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const; /// Get the name of the GPU - void DeviceGetName(char* name, int32 len, int32 dev); + void DeviceGetName(char* name, int32 len, int32 dev); /// Check if GPU is in good condition by multiplying small matrices on GPU+CPU. /// Overheated GPUs may give inaccurate results, which we want to detect. @@ -100,14 +105,16 @@ class CuDevice { /// will always be a multiple of n (from properties_.textureAlignment). /// Otherwise, return 16, which is the stride used for CPU matrices. int32 GetMatrixAlignment() const; - + private: CuDevice(); CuDevice(CuDevice&); // Disallow. CuDevice &operator=(CuDevice&); // Disallow. + static CuDevice global_device_; - + cublasHandle_t handle_; + /// Check if the GPU run in compute exclusive mode Returns true if it is /// running in compute exclusive mode and we have a GPU. Returns false /// otherwise. Sets error to true if there was some error, such as that we @@ -122,31 +129,35 @@ class CuDevice { bool SelectGpuIdManual(int32 gpu_id); void FinalizeActiveGpu(); - - /// Should only be called if Enabled() == true. + + /// Should only be called if Enabled() == true. int32 MajorDeviceVersion(); - /// Should only be called if Enabled() == true. + /// Should only be called if Enabled() == true. int32 MinorDeviceVersion(); - std::map profile_map_; - + unordered_map profile_map_; + /// active_gpu_id_ values: /// -3 default (default, the SelectGpuId was not called, we did not want to use GPU) /// -2 SelectGpuId was called, but no GPU was present /// -1 SelectGpuId was called, but the GPU was manually disabled /// 0..N Normal GPU IDs - int32 active_gpu_id_; - + int32 active_gpu_id_; + int64 free_memory_at_startup_; - + cudaDeviceProp properties_; bool verbose_; - + CuMemoryAllocator allocator_; + }; // class CuDevice +// This function is declared as a more convenient way to get the CUDA device handle for use +// in the CUBLAS v2 API, since we so frequently need to access it. 
+inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetHandle(); } } // namespace diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index caae069da9e..bb909b47c32 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -2,7 +2,7 @@ // Copyright 2009-2012 Karel Vesely // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen @@ -35,7 +35,8 @@ extern "C" { /********************************************************* * int32 CUDA kernel calls (no template wrapper) */ -void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); +void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); +void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, MatrixDim d); @@ -44,7 +45,7 @@ void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda *mat, int32_cuda value, Matr */ /* - * CuMatrix + * CuMatrix */ void cudaF_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); void cudaF_copy_low_upp(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA); @@ -55,11 +56,10 @@ void cudaF_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const float* B, Matrix void cudaFD_copy_from_tp_trans(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim dmat); void cudaF_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const float* B, MatrixDim dmat); void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim dmat); -void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d); void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim d); -void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -90,6 +90,7 @@ void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const f void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size); void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d); void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans); +void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans); void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d); void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d); @@ -106,19 +107,19 @@ void cudaF_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int dim); void cudaF_vec_soft_max(int 
Gr, int Bl, float* v, int dim); void cudaF_vec_min(const float* v, float* value, int dim); void cudaF_vec_max(const float* v, float* value, int dim); -void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value); -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, - int N_col_stride, int threads_per_element, float beta); +void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value); +void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value); +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, + int N_col_stride, int threads_per_element, float beta); void cudaF_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim); -void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); void cudaF_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size); void cudaF_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim); void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim); +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, float* num, int dim); void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim); void cudaF_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim); void cudaF_trace(int Gr, int Bl, float* mat, float* value, int dim); @@ -141,12 +142,13 @@ void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, i void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); +void cudaF_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d, int stride_grad); -void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d); +void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, MatrixDim d); void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d); void 
cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in); @@ -158,17 +160,18 @@ void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_ void cudaF_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); void cudaF_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); void cudaF_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in); -void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s); +void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements); +void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data); void cudaF_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t); -void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim); void cudaF_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indices); + const Int32Pair *indices); void cudaF_add_row_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, - const Int32Pair *indexes); + const Int32Pair *indexes); void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, const Int32Pair *indices, int indices_size, float *output); @@ -176,28 +179,27 @@ void cudaF_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, MatrixDim dim, void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - + /********************************************************* * double CUDA kernel calls */ /* - * CuMatrix + * CuMatrix */ void cudaD_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimB); void cudaD_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA); void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *vec, const double *mat2, int mat2_row_stride, - int mat2_col_stride, double beta); + int mat2_col_stride, double beta); void cudaD_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); void cudaD_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat); void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat); -void cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d); void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim d); -void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); +void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); 
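// (Naming convention used throughout this header: each kernel is exposed
//  twice through the extern "C" interface, cudaF_* for float and cudaD_* for
//  double.  A thin templated C++ layer then picks the right one by
//  overloading, roughly as in the sketch below; this is illustrative only,
//  the real overloads live in cu-kernels.h and may differ in detail:
//
//    inline void cuda_add_cols(dim3 Gr, dim3 Bl, float *dst, const float *src,
//                              const MatrixIndexT_cuda *reorder,
//                              MatrixDim dst_dim, int src_stride) {
//      cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
//    }
//    inline void cuda_add_cols(dim3 Gr, dim3 Bl, double *dst, const double *src,
//                              const MatrixIndexT_cuda *reorder,
//                              MatrixDim dst_dim, int src_stride) {
//      cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
//    }
//
//  so templated CuMatrix<Real> code can call cuda_add_cols(...) for either
//  precision without touching these C declarations directly.)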
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); @@ -228,6 +230,7 @@ void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size); void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d); void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans); +void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans); void cudaD_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d); void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d); @@ -245,19 +248,19 @@ void cudaD_vec_mul_elements(int Gr, int Bl, double* v, const double* a, int dim) void cudaD_vec_soft_max(int Gr, int Bl, double* v, int dim); void cudaD_vec_min(const double* v, double* value, int dim); void cudaD_vec_max(const double* v, double* value, int dim); -void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value); -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, - int N_col_stride, int threads_per_element, double beta); +void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value); +void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value); +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, + int N_col_stride, int threads_per_element, double beta); void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim); -void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaD_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size); void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim); void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim); +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, float* num, int dim); void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim); void cudaD_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim); void cudaD_trace(int Gr, int Bl, double* mat, double* value, int dim); @@ 
-270,7 +273,7 @@ void cudaD_add_mat_blockmat(dim3 Gr, dim3 Bl, double *data, MatrixDim d, const d void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int num_blocks, const double *C_data, int C_num_cols, int C_row_stride, int C_col_stride, const double *D_data, int D_row_stride, int D_col_stride, - double alpha, double beta); + double alpha, double beta); /* @@ -282,12 +285,13 @@ void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); +void cudaD_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, int stride_grad); -void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d); +void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, MatrixDim d); void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d); void cudaD_copy_rows_from_vec(dim3 Gr, dim3 Bl, double *mat_out, MatrixDim d_out, const double *v_in); @@ -320,17 +324,13 @@ void cuda_copy_from_smat_fd_trans(dim3 Gr, dim3 Bl, float* mat_out, const Matrix void cuda_copy_from_smat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_ff(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_fd(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_df(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); -void cuda_copy_from_smat_as_vec_dd(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in); - void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out); void cudaF_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out); void cudaD_trace_mat_smat(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); void cudaD_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, double* trace_vec_out); -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, 
MatrixElement* x, int s); +void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements); +void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data); void cudaD_comp_obj_deriv(dim3 Gr,dim3 Bl, MatrixElement* x, int s, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t); void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); @@ -345,14 +345,14 @@ void cudaD_add_row_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, void cudaD_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, MatrixDim dim, const Int32Pair *indices, int indices_size, double *output); - + void cudaD_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, - const double *mat2, double *mask, MatrixDim mat1_dim, + const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride); - - -} // extern "C" + + +} // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 00d6c71ab2d..c2d8b45174a 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -25,7 +25,7 @@ // In this file is the CUDA code of the CUDA kernels, plus the ANSI-C wrappers #include -#include "cu-kernels-ansi.h" +#include "cudamatrix/cu-kernels-ansi.h" /*********************************************************************** @@ -35,7 +35,7 @@ template __device__ static Real _sum_reduce(Real buffer[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (sum) while(nTotalThreads > 1) { @@ -70,7 +70,7 @@ static Real _min_reduce(Real buffer[]) { if (threadIdx.x < halfPoint) { if (threadIdx.x + halfPoint < nTotalThreads) { Real temp = buffer[threadIdx.x + halfPoint]; - if (temp < buffer[threadIdx.x]) + if (temp < buffer[threadIdx.x]) buffer[threadIdx.x] = temp; } } @@ -86,7 +86,7 @@ template __device__ static Real _max_reduce(Real buffer[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (max) while(nTotalThreads > 1) { @@ -96,7 +96,7 @@ static Real _max_reduce(Real buffer[]) { // Get the shared value stored by another thread if(threadIdx.x+halfPoint < nTotalThreads) { Real temp = buffer[threadIdx.x + halfPoint]; - if (temp > buffer[threadIdx.x]) + if (temp > buffer[threadIdx.x]) buffer[threadIdx.x] = temp; } } @@ -113,7 +113,7 @@ template __device__ static int32_cuda _max_id_reduce(Real val[], int32_cuda idx[]) { // Total number of active threads - int32_cuda nTotalThreads = blockDim.x; + int32_cuda nTotalThreads = blockDim.x; __syncthreads(); // perform tree-based reduction (get index of maximum) while(nTotalThreads > 1) { @@ -175,17 +175,14 @@ __global__ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim, const Real *vec, const Real *mat2, int mat2_row_stride, int mat2_col_stride, Real beta) { - // Note from Dan: in this kernel, we make the x dimension correspond to the - // row index and y to the column index. That was not always the case for - // earlier kernels written by others. 
- int i = blockIdx.y * blockDim.y + threadIdx.y; // row index - int j = blockIdx.x * blockDim.x + threadIdx.x; // column index - - int index = i * mat_dim.stride + j, - index2 = i * mat2_row_stride + j * mat2_col_stride; - - if (i < mat_dim.rows && j < mat_dim.cols) { - mat[index] = alpha * vec[i] * mat2[index2] + beta * mat[index]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + + int index = j * mat_dim.stride + i, + index2 = j * mat2_row_stride + i * mat2_col_stride; + + if (i < mat_dim.cols && j < mat_dim.rows) { + mat[index] = alpha * vec[j] * mat2[index2] + beta * mat[index]; } } @@ -193,13 +190,12 @@ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim, template __global__ static void _copy_from_tp(Real* A, const OtherReal* B, MatrixDim dmat) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - - if (i < dmat.rows && j < dmat.cols) { - int32_cuda index_B = (i * (i+1) / 2) + j; - int32_cuda index_A = i * dmat.stride + j; - if (j <= i) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dmat.cols && j < dmat.rows) { + int32_cuda index_B = (j * (j+1) / 2) + i; + int32_cuda index_A = j * dmat.stride + i; + if (i <= j) { A[index_A] = B[index_B]; } else { A[index_A] = 0.0; @@ -211,6 +207,8 @@ static void _copy_from_tp(Real* A, const OtherReal* B, MatrixDim dmat) { template __global__ static void _copy_from_tp_trans(Real* A, const OtherReal* B, MatrixDim dmat) { + // we interpret these indexes oppositely from normal, but it doesn't + // matter as it's invoked in a symmetric way. int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // transpose the indices used to index the source TpMatrix. @@ -226,32 +224,54 @@ static void _copy_from_tp_trans(Real* A, const OtherReal* B, MatrixDim dmat) { } -// for this kernel, following the newer pattern, the x-dim is the row-index, the -// y-dim is the col-index. template __global__ static void _copy_from_mat(Real* mat_out, const OtherReal* mat_in, MatrixDim d_out, MatrixDim d_in) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // row-index - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // col-index. - int32_cuda index_out = j + i * d_out.stride; - int32_cuda index_in = j + i * d_in.stride; - if (i < d_out.rows && j < d_out.cols) + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col-index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row-index. 
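// (The point of mapping threadIdx.x to the column index in these rewritten
//  kernels is memory coalescing: consecutive threads of a warp then touch
//  consecutive elements of one row of the row-major matrix, so the global
//  loads and stores coalesce.  The transposed copy that follows goes further
//  and stages TileDim x TileDim blocks in shared memory so that both the
//  read from mat_in and the write to mat_out stay coalesced.)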
+ int32_cuda index_out = i + j * d_out.stride; + int32_cuda index_in = i + j * d_in.stride; + if (i < d_out.cols && j < d_out.rows) mat_out[index_out] = static_cast(mat_in[index_in]); } +template +__global__ +static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, + MatrixDim d_out, MatrixDim d_in) { + // Use shared meme to achieve both coalesced memory reading and writing + // '+1' to avoid bank conflict when reading sbuf + __shared__ Real sbuf[TileDim][TileDim + 1]; + const int32_cuda i_in = blockIdx.y * TileDim + threadIdx.y; // row-index + const int32_cuda j_in = blockIdx.x * TileDim + threadIdx.x; // col-index + const int32_cuda tile_stride_in = CU1DBLOCK / TileDim * d_in.stride; + int32_cuda index_in = i_in * d_in.stride + j_in; -// for this kernel, the x-dim is the row-index at the output, the y-dim is the -// col-index at the output -template -__global__ -static void _copy_from_mat_trans(Real* mat_out, const OtherReal* mat_in, MatrixDim d_out, MatrixDim d_in) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // row-index out - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // col-index out - int32_cuda index_out = j + i * d_out.stride; - int32_cuda index_in = i + j * d_in.stride; - if (i < d_out.rows && j < d_out.cols) - mat_out[index_out] = static_cast(mat_in[index_in]); +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (i_in + i < d_in.rows && j_in < d_in.cols) { + sbuf[threadIdx.y + i][threadIdx.x] = static_cast(mat_in[index_in]); + } + index_in += tile_stride_in; + } + __syncthreads(); + + // Grid is transposed, but block is not yet. + // Warp (blockDim.x) is always along the row-dim. + const int32_cuda i_out = blockIdx.x * TileDim + threadIdx.y; + const int32_cuda j_out = blockIdx.y * TileDim + threadIdx.x; + const int32_cuda tile_stride_out = CU1DBLOCK / TileDim * d_out.stride; + int32_cuda index_out = i_out * d_out.stride + j_out; + +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (i_out + i < d_out.rows && j_out < d_out.cols) { + // block is tranposed when reading sbuf + mat_out[index_out] = sbuf[threadIdx.x][threadIdx.y + i]; + } + index_out += tile_stride_out; + } } template @@ -272,14 +292,6 @@ static void _copy_from_smat_trans(Real* mat_out, const MatrixElement* mat_out[data_index] = smat_in[smat_index].weight; } -template -__global__ -static void _copy_from_smat_as_vec(Real* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - int smat_index = blockIdx.x * blockDim.x + threadIdx.x; - if (smat_index >= d_in) return; - vec_out[smat_index] = smat_in[smat_index].weight; -} - template __global__ static void _trace_mat_smat_trans(const Real* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, Real* trace_vec_out) { @@ -312,25 +324,13 @@ static void _transpose_matrix(Real* mat, MatrixDim d) { mat[index_b] = a; } - -template -__global__ -static void _copy_col_from_vec(Real* mat, const Real* v, int col, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - if ( i < d.rows ) { - int32_cuda index = col + i * d.stride; - mat[index] = v[i]; - } -} - - template __global__ static void _apply_exp(Real* mat, MatrixDim d) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda index = i + j * d.stride; - if ( i < d.cols && j < d.rows ) { + if (i < d.cols && j < d.rows) { mat[index] = exp(mat[index]); } } @@ -380,8 +380,8 @@ static void _add_diag_packed(Real* mat, 
Real value, int dim) { template __global__ static void _set_const(Real* mat, Real value, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // column + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row int32_cuda index = i + j * d.stride; if (i < d.cols && j < d.rows) mat[index] = value; @@ -495,14 +495,14 @@ static void _mul_rows_vec(Real* mat, const Real* scale, MatrixDim d) { template __global__ -static void _mul_rows_group_mat(Real *y, const Real *x, MatrixDim d, +static void _mul_rows_group_mat(Real *y, const Real *x, MatrixDim d, int src_stride, int group_size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - if (j < d.rows && i < d.cols ) { + if (j < d.rows && i < d.cols ) { int dst_index = i + j * d.stride; int src_index = i / group_size + j * src_stride; - y[dst_index] *= x[src_index]; + y[dst_index] *= x[src_index]; } } @@ -514,7 +514,7 @@ static void _calc_pnorm_deriv(Real *deriv, const Real *vec, const Real *norm, MatrixDim d, int src_stride, int group_size, Real power) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - if (j < d.rows && i < d.cols ) { + if (j < d.rows && i < d.cols ) { int dst_index = i + j * d.stride, src_index = i / group_size + j * src_stride; Real vec_element = vec[dst_index], // this is the element of the original vector. @@ -571,7 +571,7 @@ static void _div_rows_vec(Real* mat, const Real* vec_div, MatrixDim d) { inv[threadIdx.y] = 1.0/vec_div[j]; } __syncthreads(); - + //multiply elements if (i < d.cols && j < d.rows) mat[index] *= inv[threadIdx.y]; @@ -581,12 +581,12 @@ static void _div_rows_vec(Real* mat, const Real* vec_div, MatrixDim d) { template __global__ static void _add_mat(Real alpha, const Real* src, Real* dst, MatrixDim d, int src_stride) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; - int32_cuda index = i + j*d.stride; - int32_cuda index_src = i + j*src_stride; + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int32_cuda index = i + j * d.stride; + int32_cuda index_src = i + j * src_stride; if (i < d.cols && j < d.rows) - dst[index] = alpha*src[index_src] + dst[index]; + dst[index] = alpha * src[index_src] + dst[index]; } template @@ -602,7 +602,37 @@ static void _add_mat_trans(Real alpha, const Real* src, Real* dst, MatrixDim d, template __global__ -static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Real* dst, MatrixDim d, int stride_a, +static void _add_mat_blocks(Real alpha, const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = i + j * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = alpha * src[index_src + p * src_stride * d.rows + q * d.cols] + dst[index]; + } + } +} + +template +__global__ +static void _add_mat_blocks_trans(Real alpha, const Real* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, Real* dst, MatrixDim d, int src_stride) { + int32_cuda i = blockIdx.x 
* blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + int32_cuda index_src = j + i * src_stride; + if (i < d.cols && j < d.rows) + for (int32_cuda p = 0; p < num_row_blocks; p++) { + for (int32_cuda q = 0; q < num_col_blocks; q++) { + dst[index] = alpha * src[index_src + p * src_stride * d.cols + q * d.rows] + dst[index]; + } + } +} + +template +__global__ +static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Real* dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; @@ -619,7 +649,7 @@ static void _add_mat_mat_div_mat(const Real* A, const Real* B, const Real* C, Re // Given a matrix input S (not packed!) and a lower-triangular matrix L, // this function does S = beta S + alpha * L^T L. This is used in PSD matrix inversion. -// The i index is the row of the destination S and the j the column (although of +// The i index is the row of the destination S and the j the column (although of // course the output is symmetric so it doesn't matter in a sense). The main point // of this is to make use of various symmetries and zero-ness. template @@ -628,14 +658,14 @@ static void _sy_add_tr2(Real alpha, Real beta, const Real *T, MatrixDim tdim, Re MatrixDim sdim) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - + if (i >= sdim.rows || j > i) return; // this thread computes the dot-product of the i'th column of // L with the j'th column of L. The values we're multiplying // are only nonzero for row-index k greater or equal to // max(i, j), which equals i. - + Real sum = 0.0; for (int k = i; k < sdim.rows; k++) { int i_index = i + tdim.stride * k, @@ -682,27 +712,22 @@ static void _apply_mask(Real* mat, const char* mask, MatrixDim dmat, MatrixDim d int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; int32_cuda index = i + j*dmat.stride; int32_cuda index2 = i + j*dmask.stride; - if ( i < dmat.cols && j < dmat.rows ) + if ( i < dmat.cols && j < dmat.rows ) if(mask[index2] == 0) mat[index] = 0; } template __global__ static void _add_mat_diag_vec(Real alpha, Real *mat, MatrixDim mat_dim, - const Real *mat2, int mat2_row_stride, int mat2_col_stride, + const Real *mat2, int mat2_row_stride, int mat2_col_stride, const Real *vec, Real beta) { - // Note from Dan: in this kernel, we make the x dimension correspond to the - // row index and y to the column index. That was not always the case for - // earlier kernels written by others. - int i = blockIdx.x * blockDim.x + threadIdx.x; // row index - int j = blockIdx.y * blockDim.y + threadIdx.y; // column index - - int index = i * mat_dim.stride + j, - index2 = i * mat2_row_stride + j * mat2_col_stride; - - if (i < mat_dim.rows && j < mat_dim.cols) { - mat[index] = alpha * mat2[index2] * vec[j] + beta * mat[index]; - } + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + + int index = i + j * mat_dim.stride, + index2 = i * mat2_col_stride + j * mat2_row_stride; + if (j < mat_dim.rows && i < mat_dim.cols) + mat[index] = alpha * mat2[index2] * vec[i] + beta * mat[index]; } template @@ -738,7 +763,7 @@ static void _set_bias_params(Real* v, const Real* a, Real param_1, Real param_2, v[i] = v[i] / factor; } else if ( ratio > param_1 ) { Real factor = ((ratio/param_1) > param_2) ? 
param_2 : (ratio/param_1); - v[i] = v[i] * factor; + v[i] = v[i] * factor; } } } @@ -749,7 +774,7 @@ __global__ static void _copy_from_vec_df(double* v_out, const Real* v_in, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { v_out[i] = (double) v_in[i]; } @@ -757,30 +782,25 @@ static void _copy_from_vec_df(double* v_out, const Real* v_in, int dim) { // This kernel writes a copy of the vector "v_in" to each row of the matrix -// "m_out". the dimension of v_in should be equal to the #columns of m_out. In -// this kernel, following the new pattern, x corresponds to row-index and y to -// column-index. +// "m_out". the dimension of v_in should be equal to the #columns of m_out. template __global__ static void _copy_rows_from_vec(Real* m_out, MatrixDim d, const Real* v_in) { - int i = blockIdx.x * blockDim.x + threadIdx.x; // row index. - int j = blockIdx.y * blockDim.y + threadIdx.y; // column index. - - if (i < d.rows && j < d.cols) { - int index = i * d.stride + j; - m_out[index] = v_in[j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index. + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index. + if (i < d.cols && j < d.rows) { + int index = i + j * d.stride; + m_out[index] = v_in[i]; } } - - template __global__ static void _copy_from_vec_fd(float* v_out, const Real* v_in, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - - if ( i < dim) { + + if (i < dim) { v_out[i] = (float) v_in[i]; } } @@ -792,7 +812,7 @@ static void _vec_min(const Real* v, Real* value, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= CU1DBLOCK) return; - + __shared__ Real row_data[CU1DBLOCK]; int block_size = (dim + CU1DBLOCK - 1) / CU1DBLOCK; @@ -841,84 +861,128 @@ static void _vec_max(const Real* v, Real* value, int dim) { } -// _trace_mat_mat expects to be called with 1 blocks, each of dimension -// CU1DBLOCK. Each block outputs a partial sum to value[blockIdx.x], -// i.e. value[0 through 0]. -template -__global__ -static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; +// _trace_mat_mat reduce the partial sum to value[blockIdx.y * gridDim.x + blockIdx.x] +// It use shared mem to transpose matrix B to ensure coalesced memory access +template +__global__ +static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA, + int B_stride, Real* value) { + // Reuse shared mem and make indexing easier. 
"+1" to avoid bank conflict + __shared__ union { + Real trans[TileDim][TileDim + 1]; + Real sum[CU1DBLOCK]; + } smem; + const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread id; + const int32_cuda grid_height = gridDim.y * TileDim; + + const int32_cuda ja = blockIdx.x * TileDim + threadIdx.x; + const int32_cuda ib = blockIdx.x * TileDim + threadIdx.y; + int32_cuda ia = blockIdx.y * TileDim + threadIdx.y; + int32_cuda jb = blockIdx.y * TileDim + threadIdx.x; + + // Grid reduce + Real tsum = Real(0); + for (int32_cuda i0 = 0; i0 < dA.rows; i0 += grid_height) { + // Load from B, transpose the block and store in shared mem + if (jb < dA.rows) { +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (ib + i < dA.cols) { + smem.trans[threadIdx.x][threadIdx.y + i] = + B[(ib + i) * B_stride + jb]; + } + } + } + __syncthreads(); - if(blockIdx.x > num_blocks || threadIdx.x > CU1DBLOCK) return; - - int num_elements = dA.rows * dA.cols, - num_threads = CU1DBLOCK * num_blocks; - int block_size = (num_elements + num_threads - 1) / num_threads; - int loop_start = i * block_size, loop_end = (i + 1) * block_size; - if (loop_end > num_elements) - loop_end = num_elements; + // Load from A, sum up the product. + if (ja < dA.cols) { +# pragma unroll + for (int i = 0; i < TileDim; i += CU1DBLOCK / TileDim) { + if (ia + i < dA.rows) { + tsum += A[(ia + i) * dA.stride + ja] + * smem.trans[threadIdx.y + i][threadIdx.x]; + } + } + } + __syncthreads(); - Real sum = 0.0; - for (int j = loop_start; j < loop_end; j++) { - // for (int j = i; j < num_elements; j += num_threads) { - int row = j / dA.cols, col = j % dA.cols; // "row" is row-index in A, "col" is - // col-index in A; in B, it's reversed. - int index_A = col + row * dA.stride, - index_B = row + col * B_stride; - sum += A[index_A] * B[index_B]; + ia += grid_height; + jb += grid_height; } - __shared__ Real row_data[CU1DBLOCK]; - - row_data[threadIdx.x] = sum; + smem.sum[tid] = tsum; __syncthreads(); - Real ans = _sum_reduce(row_data); - if (threadIdx.x == 0) - value[blockIdx.x] = ans; + // Block reduce +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + smem.sum[tid] += smem.sum[tid + shift]; + __syncthreads(); + } + + // Warp reduce. Implicitly synchronized within a warp. + if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + smem.sum[tid] += smem.sum[tid + shift]; + } + } + + // output 1 sum per thread block + if (tid == 0) { + value[blockIdx.y * gridDim.x + blockIdx.x] = smem.sum[0]; + } } -// _trace_mat_mat_trans expects to be called with 4 blocks, each of dimension -// CU1DBLOCK. Each block outputs a partial sum to value[blockIdx.x], -// i.e. value[0 through 3]. 
-template +// _trace_mat_mat_trans reduce the partial sum to value[blockIdx.y * gridDim.x + blockIdx.x] +template __global__ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int B_stride, Real* value) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - - if(blockIdx.x > num_blocks || threadIdx.x > CU1DBLOCK) return; - - int num_elements = dA.rows * dA.cols, - num_threads = CU1DBLOCK * num_blocks; - // int block_size = (num_elements + num_threads - 1) / num_threads; - // int loop_start = i * block_size, loop_end = (i + 1) * block_size; - // if (loop_end > num_elements) - // loop_end = num_elements; - - Real sum = 0.0; - // for (int j = loop_start; j < loop_end; j++) { - for (int j = i; j < num_elements; j += num_threads) { - int row = j / dA.cols, col = j % dA.cols; // "row" is row-index in A, "col" is - // col-index in A; in B, it's reversed. - int index_A = col + row * dA.stride, - index_B = col + row * B_stride; - sum += A[index_A] * B[index_B]; + __shared__ Real ssum[CU1DBLOCK]; + const int32_cuda tid = threadIdx.y * blockDim.x + threadIdx.x; // linear thread id; + const int32_cuda j = blockIdx.x * blockDim.x + threadIdx.x; + const int32_cuda grid_height = gridDim.y * blockDim.y; + int32_cuda i = blockIdx.y * blockDim.y + threadIdx.y; + + // Grid reduce + Real tsum = Real(0); + if (j < dA.cols) { + while (i < dA.rows) { + tsum += A[i * dA.stride + j] * B[i * B_stride + j]; + i += grid_height; + } } - __shared__ Real row_data[CU1DBLOCK]; + ssum[tid] = tsum; + __syncthreads(); - row_data[threadIdx.x] = sum; + // Block reduce +# pragma unroll + for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) { + if (tid < shift) + ssum[tid] += ssum[tid + shift]; + __syncthreads(); + } - __syncthreads(); + // Warp reduce. Implicitly synchronized within a warp. + if (tid < warpSize) { +# pragma unroll + for (int shift = warpSize; shift > 0; shift >>= 1) { + ssum[tid] += ssum[tid + shift]; + } + } - Real ans = _sum_reduce(row_data); - if (threadIdx.x == 0) - value[blockIdx.x] = ans; + // output 1 sum per thread block + if (tid == 0) { + value[blockIdx.y * gridDim.x + blockIdx.x] = ssum[0]; + } } // Adds diag(M N) to v, where M and N are matrices. We supply row_stride and // col_stride arguments for M and N, and swapping them allows us to transpose -// those matrices. Note: we imagine row-major indexing here, just like Kaldi +// those matrices. Note: we imagine row-major indexing here, just like Kaldi // and CBLAS (but unlike CUBLAS). // This kernel expects the blockDim to be (CU1DBLOCK, 1) and the // gridDim times CU1DBLOCK to be at least num-rows-of-v * threads_per_element. @@ -929,24 +993,24 @@ static void _add_diag_mat_mat( Real alpha, Real* v, int v_dim, const Real* M, int M_cols, int M_row_stride, int M_col_stride, const Real *N, int N_row_stride, int N_col_stride, int threads_per_element, Real beta) { - + // we actually assume blockDim.x == CU1DBLOCK here. // Each diagonal element of v is processed by "threads_per_element" threads. __shared__ Real temp_data[CU1DBLOCK]; int i = blockIdx.x * blockDim.x + threadIdx.x; int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to - sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells + sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells // us which block of elements we sum up. 
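// (In other words, each output element v[v_idx] is computed cooperatively by
//  'threads_per_element' consecutive threads: thread sub_idx accumulates
//  every threads_per_element-th term of the dot-product over M_cols, the
//  per-thread partial sums are combined by the tree reduction over temp_data
//  further down, and only the sub_idx == 0 thread writes the result back
//  into v.)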
- if (v_idx >= v_dim) return; - - Real sum = 0.0; - for (int j = sub_idx; j < M_cols; j += threads_per_element) { - int M_index = v_idx * M_row_stride + j * M_col_stride, - N_index = j * N_row_stride + v_idx * N_col_stride; - sum += M[M_index] * N[N_index]; + if (v_idx < v_dim) { + Real sum = 0.0; + for (int j = sub_idx; j < M_cols; j += threads_per_element) { + int M_index = v_idx * M_row_stride + j * M_col_stride, + N_index = j * N_row_stride + v_idx * N_col_stride; + sum += M[M_index] * N[N_index]; + } + temp_data[threadIdx.x] = sum; } - temp_data[threadIdx.x] = sum; // start_idx = threadIdx.x - sub_idx; // start of the position in temp_data // that we want to sum up. @@ -966,7 +1030,7 @@ static void _add_diag_mat_mat( __syncthreads(); num_total_threads = half_point; } - if (sub_idx == 0) { + if (sub_idx == 0 && v_idx < v_dim) { v[v_idx] = beta * v[v_idx] + alpha * temp_data[threadIdx.x]; } } @@ -983,18 +1047,6 @@ static void _add_vec_vec(Real alpha, Real* v, const Real* x, const Real* y, Real } -template -__global__ -static void _copy_col_from_mat(Real* v, int col, const Real* mat, MatrixDim dmat, int dim) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda index = col + i * dmat.stride; - // if (blockIdx.y > 0) return; - - if (i < dim) - v[i] = mat[index]; -} - - template __global__ static void _copy_col_from_mat_df(double* v, int col, const Real* mat, MatrixDim dmat, int dim) { @@ -1024,10 +1076,10 @@ __global__ static void _vec_apply_exp(Real* v, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { v[i] = exp(v[i]); - } + } } @@ -1036,7 +1088,7 @@ __global__ static void _vec_apply_log(Real* v, Real* flag, int dim) { int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // if (blockIdx.y > 0) return; - + if (i < dim) { if (v[i] < 0) { *flag = 1; @@ -1072,10 +1124,10 @@ static void _cuda_comp_obj_deriv(MatrixElement *x, int s, const Real* z, M for(int j = loop_start; j< loop_end; j++) { int m = (x + j)->row; //* ((int*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) )) ); int label = (x + j)->column; //*(int*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) )+ sizeof(int)); - Real weight = (x + j)->weight; //*(Real*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) ) + 2 * sizeof(int)); + Real weight = (x + j)->weight; //*(Real*) ((size_t)x + j * (2 * sizeof(int) + sizeof(Real) ) + 2 * sizeof(int)); tmp_weight_sum += weight; Real this_prob = *(z + m * d.stride + label); - tmp_tot_objf += weight * log(this_prob); + tmp_tot_objf += weight * log(this_prob); *(z2 + m * d2.stride + label ) += weight / this_prob;// there might be problems here.... 
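// (The objective being accumulated here is sum_j weight_j * log z(row_j,
//  label_j), so its derivative with respect to z(row_j, label_j) is
//  weight_j / z(row_j, label_j), which is the quantity added into z2 above;
//  the reductions below then leave the total objective in t[0] and the total
//  weight in t[1].)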
} @@ -1084,34 +1136,32 @@ static void _cuda_comp_obj_deriv(MatrixElement *x, int s, const Real* z, M __syncthreads(); *t = _sum_reduce(tot_objf); __syncthreads(); - *(t+1) = _sum_reduce(tot_weight); + *(t+1) = _sum_reduce(tot_weight); return; } template __global__ -static void _cuda_matrix_add_elements(Real *data, MatrixDim dim, Real alpha, MatrixElement* x, int s) { - int i = threadIdx.x; +static void _cuda_matrix_add_elements(Real *data, MatrixDim dim, Real alpha, MatrixElement* x, int num_elements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= num_elements) + return; + data[x[i].row * dim.stride + x[i].column] += alpha * x[i].weight; +} + +template +__global__ +static void _cuda_matrix_add_indexed_values(MatrixDim dim, Real alpha, + const Int32Pair* indices, const Real* x, + int s, Real* data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= s) return; - int size = s / CU1DBLOCK; //the least size in a loop (later part) - int threshold = s - size * CU1DBLOCK; //any loop below this number would + 1 - - int loop_start; - int loop_end; - if(i < threshold) { - loop_start = i * (size + 1); - loop_end = (i+1) * (size + 1); - } - else { - loop_start = threshold + i*size; - loop_end = threshold + (i+1)*size; - } - for(int j = loop_start; j < loop_end; j++) { - *(data + x[j].row * dim.stride + x[j].column) += alpha * x[j].weight; - } + int data_i = indices[i].first * dim.stride + indices[i].second; + data[data_i] += alpha * x[i]; } + template __global__ static void _matrix_lookup(const Real *data, MatrixDim dim, @@ -1127,8 +1177,8 @@ static void _matrix_lookup(const Real *data, MatrixDim dim, template __global__ static void _equal_element_mask(const Real *mat1, const Real *mat2, Real *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; //col - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; //row + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; // col + int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; // row int32_cuda index_mat1 = i + j*mat1_dim.stride; int32_cuda index_mat2 = i + j*mat2_stride; int32_cuda index_mask = i + j*mask_stride; @@ -1140,10 +1190,10 @@ template __global__ static void _vec_sum(Real *v, Real *sum, int dim, int inc) { int i = threadIdx.x; - __shared__ Real row_data[CU1DBLOCK]; + __shared__ Real row_data[CU1DBLOCK]; if (i >= CU1DBLOCK) return; - + Real tmp_sum = 0; int size = dim / CU1DBLOCK; //the least size in a loop (later part) int threshold = dim - size * CU1DBLOCK; //any loop below this number would + 1 @@ -1161,7 +1211,7 @@ static void _vec_sum(Real *v, Real *sum, int dim, int inc) { for(int j = loop_start; j< loop_end; j++) { tmp_sum += v[j * inc]; } - + row_data[threadIdx.x] = tmp_sum; __syncthreads(); *sum = _sum_reduce(row_data); @@ -1173,7 +1223,6 @@ __global__ static void _pvec_sum(Real* v, Real* g, int dim, int size) { int i = blockIdx.x * blockDim.x + threadIdx.x; int start = size * i; - if (start >= dim) return; int end = start + size; if (end > dim) end = dim; __shared__ Real row_data[CU1DBLOCK]; @@ -1191,7 +1240,7 @@ template __global__ static void _vec_apply_floor(Real *v, Real floor_val, float *count, int dim) { int i = blockIdx.x * blockDim.x + threadIdx.x; - + if ( i < dim) { if ( v[i] < floor_val) { v[i] = floor_val; @@ -1202,18 +1251,28 @@ static void _vec_apply_floor(Real *v, Real floor_val, float *count, int dim) { } } - -// Caution, here i/block{idx,dim}.x is the row index and j/block{idx,dim}.y is the col index. 
-// this is for no reason, really, I just happened to prefer this -// at the time. [dan] template __global__ -static void _apply_pow(Real* mat, Real power, MatrixDim d) { +static void _vec_apply_ceiling(Real *v, Real ceiling_val, float *count, int dim) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - if (i < d.rows && j < d.cols) { + if ( i < dim) { + if ( v[i] > ceiling_val) { + v[i] = ceiling_val; + count[i] = 1; + } else { + count[i] = 0; + } + } +} + +template +__global__ +static void _apply_pow(Real* mat, Real power, MatrixDim d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) { if (power == 1.0) return; if (power == 2.0) { @@ -1231,13 +1290,12 @@ static void _apply_pow(Real* mat, Real power, MatrixDim d) { template __global__ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - - if (i < d.rows && j < d.cols) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) { if (include_sign == true && mat[index] < 0) { - if (power == 1.0) + if (power == 1.0) mat[index] = -std::abs(mat[index]); if (power == 2.0) { mat[index] = -mat[index] * mat[index]; @@ -1247,7 +1305,7 @@ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d mat[index] = -pow(std::abs(mat[index]), power); } } else { - if (power == 1.0) + if (power == 1.0) mat[index] = std::abs(mat[index]); if (power == 2.0) { mat[index] = mat[index] * mat[index]; @@ -1262,27 +1320,22 @@ static void _apply_pow_abs(Real* mat, Real power, bool include_sign, MatrixDim d } } -// Caution, here i/block{idx,dim}.x is the row index and j/block{idx,dim}.y is the col index. -// this is for no reason, really, I just happened to prefer this -// at the time. [dan] template __global__ static void _apply_heaviside(Real* mat, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - int index = i * d.stride + j; - - if (i < d.rows && j < d.cols) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + int index = i + j * d.stride; + if (i < d.cols && j < d.rows) mat[index] = (mat[index] > 0.0 ? 1.0 : 0.0); - } } template __global__ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index int index = i + j * d.stride; if (i < d.cols && j < d.rows) { @@ -1295,57 +1348,50 @@ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { template __global__ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. 
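// With the convention used in the rewritten kernels above (threadIdx.x / blockIdx.x walk
// columns, threadIdx.y / blockIdx.y walk rows), consecutive threads of a warp touch
// consecutive elements of a row, so global-memory accesses coalesce. A minimal host-side
// launch sketch under that assumption (the real grid set-up lives in the callers, e.g.
// cu-matrix.cc, and may differ):
//   dim3 Bl(CU2DBLOCK, CU2DBLOCK);                       // e.g. 16 x 16 threads
//   dim3 Gr(n_blocks(dst_dim.cols, CU2DBLOCK),
//           n_blocks(dst_dim.rows, CU2DBLOCK));
//   cuda_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);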
- - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[j], - dst_index = i * dst_dim.stride + j; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[i], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = i * src_stride + reorder[j]; - Real val = src[src_index]; + int src_index = j * src_stride + reorder[i]; + Real val = src[src_index]; dst[dst_index] = val; } else { dst[dst_index] = 0.0; } - } + } } template __global__ -static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. - - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[j], - dst_index = i * dst_dim.stride + j; +static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[i], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = i * src_stride + reorder[j]; - Real val = src[src_index]; + int src_index = j * src_stride + index; + Real val = src[src_index]; dst[dst_index] += val; } - } + } } template __global__ -static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - // Note: in this kernel, the x dimension corresponds to rows and the y to columns, - // as it will be going forward. 
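// (In the rewritten version below, reorder[] is indexed by the *destination* row j:
//  dst row j is copied from src row reorder[j], and a negative reorder[j] means
//  "no source row", in which case the destination row is zero-filled. _copy_cols
//  above treats reorder[] the same way, per destination column.)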
- - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int index = reorder[i], - dst_index = i * dst_dim.stride + j; +static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, + MatrixDim dst_dim, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int index = reorder[j], + dst_index = j * dst_dim.stride + i; if (index >= 0) { - int src_index = reorder[i] * src_stride + j; - Real val = src[src_index]; + int src_index = reorder[j] * src_stride + i; + Real val = src[src_index]; dst[dst_index] = val; } else { dst[dst_index] = 0; @@ -1356,12 +1402,13 @@ static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reor template __global__ static void _copy_rows(Real* dst, const Real *const *src, MatrixDim dst_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (src[i] != NULL) { - dst[dst_index] = src[i][j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + const Real *pointer = src[j]; + if (pointer != NULL) { + dst[dst_index] = pointer[i]; } else { dst[dst_index] = 0; } @@ -1372,11 +1419,12 @@ template __global__ static void _copy_to_rows(Real* const* dst, const Real *src, MatrixDim src_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < src_dim.rows && j < src_dim.cols) { - if (dst[i] != NULL) { - dst[i][j] = src[i * src_dim.stride + j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < src_dim.cols && j < src_dim.rows) { + Real *pointer = dst[j]; + if (pointer != NULL) { + pointer[i] = src[j * src_dim.stride + i]; } } } @@ -1386,27 +1434,27 @@ __global__ static void _add_rows(Real alpha, Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (reorder[i] >= 0) { - int src_index = reorder[i] * src_stride + j; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (reorder[j] >= 0) { + int src_index = reorder[j] * src_stride + i; dst[dst_index] += alpha * src[src_index]; } - } + } } template __global__ static void _add_rows(Real alpha, Real* dst, const Real *const *src, MatrixDim dst_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dst_dim.rows && j < dst_dim.cols) { - int dst_index = i * dst_dim.stride + j; - if (src[i] != NULL) { - dst[dst_index] += alpha * src[i][j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < dst_dim.cols && j < dst_dim.rows) { + int dst_index = j * dst_dim.stride + i; + if (src[j] != NULL) { + dst[dst_index] += 
alpha * src[j][i]; } } } @@ -1415,11 +1463,11 @@ template __global__ static void _add_to_rows(Real alpha, Real* const* dst, const Real *src, MatrixDim src_dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < src_dim.rows && j < src_dim.cols) { - if (dst[i] != NULL) { - dst[i][j] += alpha * src[i * src_dim.stride + j]; + int i = blockIdx.x * blockDim.x + threadIdx.x; // col index + int j = blockIdx.y * blockDim.y + threadIdx.y; // row index + if (i < src_dim.cols && j < src_dim.rows) { + if (dst[j] != NULL) { + dst[j][i] += alpha * src[j * src_dim.stride + i]; } } } @@ -1471,7 +1519,7 @@ static void _add_mat_blockmat_trans(Real *data, MatrixDim dim, const Real *A_dat BT_col_stride = cu_data.matrix_dim.stride; const Real *B_data = static_cast(cu_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < BT_num_cols; k++) { const Real *this_BT_col = B_data + k * BT_col_stride; const Real *this_A_row = A_data + i * A_row_stride + BT_row_start * A_col_stride; @@ -1496,7 +1544,7 @@ static void _add_mat_blockmat(Real *data, MatrixDim dim, const Real *A_data, int if (i >= A_num_rows || j >= B_num_blocks) return; const CuBlockMatrixData &block_data = B_cu_data[j]; - + int B_row_start = block_data.row_offset, B_col_start = block_data.col_offset, B_num_rows = block_data.matrix_dim.rows, @@ -1504,7 +1552,7 @@ static void _add_mat_blockmat(Real *data, MatrixDim dim, const Real *A_data, int B_row_stride = block_data.matrix_dim.stride; const Real *B_data = static_cast(block_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < B_num_cols; k++) { const Real *this_B_col = B_data + k; const Real *this_A_row = A_data + i * A_row_stride + B_row_start * A_col_stride; @@ -1551,7 +1599,7 @@ static void _block_add_mat_mat(CuBlockMatrixData *B_cu_data, int num_blocks, i * block_data.matrix_dim.stride + j; Real B_val = *B_elem; - + // B_row and B_col are the (row, col) index into the full matrix B. int B_row = block_data.row_offset + i, B_col = block_data.col_offset + j; @@ -1585,7 +1633,7 @@ static void _blockadd_mat_blockmat_trans(Real *data, MatrixDim dim, const Real * BT_col_stride = cu_data.matrix_dim.stride; const Real *B_data = static_cast(cu_data.matrix_data); // Cast from void; // we avoided a bunch of hassle by doing this (relates to Ansi-C requirement). - + for (int k = 0; k < BT_num_cols; k++) { const Real *this_BT_col = B_data + k * BT_col_stride; const Real *this_A_row = A_data + i * A_row_stride + BT_row_start * A_col_stride; @@ -1600,17 +1648,14 @@ static void _blockadd_mat_blockmat_trans(Real *data, MatrixDim dim, const Real * } } - -// Since this is a newer kernel, x is the row-index and y is the -// column-index. 
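// (The two range kernels that follow, _sum_column_ranges and _add_row_ranges, now use
//  the same x = column / y = row thread mapping as the element-wise kernels above.
//  Each Int32Pair gives a half-open [first, second) range of source indices -- rows in
//  _add_row_ranges and, by symmetry, columns in _sum_column_ranges -- whose elements are
//  accumulated into a single destination element.)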
template __global__ static void _sum_column_ranges(Real *data, MatrixDim dim, const Real *src_data, MatrixDim src_dim, const Int32Pair *indices) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; if (row >= dim.rows || col >= dim.cols) return; int dst_index = row * dim.stride + col, @@ -1626,15 +1671,16 @@ template __global__ static void _add_row_ranges(Real *data, MatrixDim dim, const Real *src_data, MatrixDim src_dim, const Int32Pair *indexes) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - int col = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + int row = blockIdx.y * blockDim.y + threadIdx.y; if (row >= dim.rows || col >= dim.cols) return; int dst_index = row * dim.stride + col; - for (int row_index = indexes[col].first; - row_index < indexes[col].second; row_index++) { + int src_index_start = indexes[row].first, + src_index_end = indexes[row].second; + for (int row_index = src_index_start; row_index < src_index_end; + row_index++) data[dst_index] += src_data[row_index * src_dim.stride + col]; - } } template @@ -1654,7 +1700,7 @@ static void _soft_hinge(Real*y, const Real*x, MatrixDim d, int src_stride) { template __global__ -static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, +static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, int group_size, Real power) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; @@ -1663,20 +1709,20 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, Real tmp = 0; int src_begin_index = i * group_size + j * src_stride; int src_end_index = src_begin_index + group_size; - for (int src_index = src_begin_index; src_index < src_end_index; + for (int src_index = src_begin_index; src_index < src_end_index; src_index ++) { - tmp += pow(std::abs(x[src_index]), power); + tmp += pow(std::abs(x[src_index]), power); } tmp = pow(tmp, Real(1.0 / power)); if (!isnan(tmp)) { y[dst_index] = tmp; } else { Real max_value = x[src_begin_index], min_value = max_value; - for (int src_index = src_begin_index + 1; + for (int src_index = src_begin_index + 1; src_index < src_end_index; src_index ++) { - if (x[src_index] > max_value) + if (x[src_index] > max_value) max_value = x[src_index]; - if (x[src_index] < min_value) + if (x[src_index] < min_value) min_value = x[src_index]; } tmp = 0.0; @@ -1689,7 +1735,7 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, for (int src_index = src_begin_index; src_index < src_end_index; src_index ++) { Real x_scaled = x[src_index] / max_abs_value; - tmp += pow(std::abs(x_scaled), Real(power)); + tmp += pow(std::abs(x_scaled), Real(power)); } y[dst_index] = pow(tmp, Real(1.0 / power)) * max_abs_value; } @@ -1740,7 +1786,7 @@ static void _diff_sigmoid(Real*eout, const Real*e, const Real*y, MatrixDim d, in int dst_index = i + j*d.stride; int e_index = i + j*e_stride; int y_index = i + j*y_stride; - if (i < d.cols && j < d.rows ) + if (i < d.cols && j < d.rows ) eout[dst_index] = y[y_index]*(1.0-y[y_index]) * e[e_index]; } @@ -1769,13 +1815,26 @@ __global__ static void _diff_tanh(Real*eout, const Real*e, const Real*y, MatrixDim d, int e_stride, int y_stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; - int dst_index = i + j*d.stride; - int 
e_index = i + j*e_stride; + int dst_index = i + j*d.stride; + int e_index = i + j*e_stride; int y_index = i + j*y_stride; - if (i < d.cols && j < d.rows ) + if (i < d.cols && j < d.rows ) eout[dst_index] = (1.0 - y[y_index]*y[y_index]) * e[e_index]; } +template +__global__ +static void _heaviside(Real*y, const Real*x, MatrixDim d, int src_stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dst_index = i + j*d.stride, src_index = i + j*src_stride; + if(i < d.cols && j < d.rows) { + Real res = (x[src_index] > 0.0 ? 1.0 : 0.0); + y[dst_index] = res; + } +} + + template __global__ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { @@ -1809,7 +1868,7 @@ static void _softmax_reduce(Real*y, const Real*x, MatrixDim d, int src_stride) { } Real max = aux[0]; __syncthreads(); - + // subtract max, apply exp, sum up... y[threadIdx.x+j*d.stride] = exp(x[threadIdx.x+j*d.stride] - max); aux[threadIdx.x] = y[threadIdx.x+j*d.stride]; @@ -1980,10 +2039,10 @@ static void _vec_copy_diag_from_packed(Real* y, const Real* x, int dim) { template __global__ static void _copy_from_sp(const Real* x, Real* y, MatrixDim dim) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int j = blockIdx.y * blockDim.y + threadIdx.y; - if (i < dim.rows && j < dim.cols) { - int dst_index = i * dim.stride + j, src_index; + int i = blockIdx.x * blockDim.x + threadIdx.x; // column index + int j = blockIdx.y * blockDim.y + threadIdx.y; // + if (i < dim.cols && j < dim.rows) { + int dst_index = i + j * dim.stride, src_index; if (j <= i) { // no transpose src_index = (i * (i+1) / 2) + j; } else { // transpose. @@ -2041,7 +2100,7 @@ static void _regularize_l1(Real* wei, Real* grad, Real l1, Real lr, MatrixDim d, if (i < d.cols && j < d.rows) { if(wei[index]==0.0) return; //skip L1 if zero weight! - + Real l1_signed = l1; if(wei[index] < 0.0) //flip sign l1_signed = -l1; @@ -2057,36 +2116,63 @@ static void _regularize_l1(Real* wei, Real* grad, Real l1, Real lr, MatrixDim d, } } - - template __global__ -static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; - int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y; +static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id, + MatrixDim d) { + const int32_cuda i = blockIdx.x; + const int32_cuda base = i * d.stride; + const int32_cuda tid = threadIdx.x; - if(blockIdx.x > 0) return; - if(blockDim.y != 1) return; + __shared__ Real smax[CU1DBLOCK]; + __shared__ int32_cuda sidx[CU1DBLOCK]; - __shared__ Real value[CU1DBLOCK]; - __shared__ int32_cuda index[CU1DBLOCK]; + Real tmax = -1e20; + int32_cuda tidx = -1; - //copy to shared memory - value[threadIdx.x] = mat[i+j*d.stride]; - index[threadIdx.x] = threadIdx.x; - __syncthreads(); - - //get the id of the max value - int32_cuda out_max = _max_id_reduce(value, index); - __syncthreads(); + // Loop over blocks for coalesced memory access. 
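// (Each thread block handles one matrix row i; thread tid scans columns
//  tid, tid + CU1DBLOCK, tid + 2*CU1DBLOCK, ..., keeping its running maximum and the
//  corresponding column index, so in every pass the CU1DBLOCK threads read CU1DBLOCK
//  consecutive elements of the row. The per-thread (max, argmax) pairs are then combined
//  by the shared-memory tree reduction and the final warp reduction below.)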
+ for (int32_cuda j = tid; j < d.cols; j += CU1DBLOCK) { + const Real val = mat[base + j]; + if (val > tmax) { + tmax = val; + tidx = j; + } + } - //see if it's bigger value - if(threadIdx.x == 0) { - if(vec_val[j] <= mat[out_max+j*d.stride]) { - vec_val[j] = mat[out_max+j*d.stride]; - vec_id[j] = voff+out_max; + smax[tid] = tmax; + sidx[tid] = tidx; + + // Parallel reduce + #pragma unroll + for (int32_cuda num_working_threads = CU1DBLOCK / 2; + num_working_threads >= warpSize; num_working_threads >>= 1) { + __syncthreads(); + if (tid < num_working_threads) { + if (smax[tid + num_working_threads] > smax[tid]) { + smax[tid] = smax[tid + num_working_threads]; + sidx[tid] = sidx[tid + num_working_threads]; + } } } + // Warp reduce without __syncthreads() + // (note.: synchronizes implicitly within a warp at the multiprocessor) + if (tid < warpSize / 2) { + #pragma unroll + for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0; + num_working_threads >>= 1) { + if (smax[tid + num_working_threads] > smax[tid]) { + smax[tid] = smax[tid + num_working_threads]; + sidx[tid] = sidx[tid + num_working_threads]; + } + } + } + + if (tid == 0) { + if (vec_val) { + vec_val[i] = smax[0]; + } + vec_id[i] = sidx[0]; + } } @@ -2113,10 +2199,13 @@ static void _diff_xent(const int32_cuda* vec_tgt, Real* mat_net_out, Real* vec_l */ /* - * "int32" + * "int32" */ -void cudaI32_set_const(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { - _set_const<<>>(mat,value,d); +void cuda_int32_set_const(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { + _set_const<<>>(mat,value,d); +} +void cuda_int32_add(dim3 Gr, dim3 Bl, int32_cuda* mat, int32_cuda value, MatrixDim d) { + _add<<>>(mat,value,d); } @@ -2151,11 +2240,6 @@ void cudaFD_copy_from_tp(dim3 Gr, dim3 Bl, float* A, const double* B, MatrixDim _copy_from_tp<<>>(A,B,dmat); } - -void cudaF_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { - _copy_col_from_vec<<>>(mat,v,col,d); -} - void cudaF_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _transpose_matrix<<>>(mat, d); } @@ -2174,7 +2258,6 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_heaviside<<>>(mat, d); - } void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { @@ -2230,7 +2313,7 @@ void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { } void cudaF_set_const(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _set_const<<>>(mat,value,d); + _set_const<<>>(mat,value,d); } void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { @@ -2238,7 +2321,7 @@ void cudaF_set_zero_above_diag(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { } void cudaF_add(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _add<<>>(mat,value,d); + _add<<>>(mat,value,d); } void cudaF_scale_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { @@ -2246,45 +2329,45 @@ void cudaF_scale_diag_packed(int Gr, int Bl, float* mat, float value, int dim) { } void cudaF_scale(dim3 Gr, dim3 Bl, float* mat, float value, MatrixDim d) { - _scale<<>>(mat,value,d); + _scale<<>>(mat,value,d); } void cudaF_apply_log(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { - _apply_log<<>>(mat,d); + _apply_log<<>>(mat,d); } void cudaF_mul_elements(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, 
int src_stride) { - _mul_elements<<>>(mat,A,dst_d,src_stride); + _mul_elements<<>>(mat,A,dst_d,src_stride); } void cudaF_div_elements(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, int src_stride) { - _div_elements<<>>(mat,A,dst_d,src_stride); + _div_elements<<>>(mat,A,dst_d,src_stride); } void cudaF_max(dim3 Gr, dim3 Bl, float* mat, const float* A, MatrixDim dst_d, int src_stride) { - _max<<>>(mat,A,dst_d,src_stride); + _max<<>>(mat,A,dst_d,src_stride); } void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { - _mul_cols_vec<<>>(mat,scale,d); + _mul_cols_vec<<>>(mat,scale,d); } void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); } -void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, +void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { _mul_rows_group_mat<<>>(y, x, d, src_stride, group_size); } -void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, +void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) { _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } -void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, +void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) { _calc_group_max_deriv<<>>(y, x1, x2, d, src_stride, group_size); @@ -2296,12 +2379,20 @@ void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* vec_div, Matr void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float* src, float* dst, MatrixDim d, int src_stride, int A_trans) { if (A_trans) { - _add_mat_trans<<>>(alpha,src,dst,d,src_stride); + _add_mat_trans<<>>(alpha,src,dst,d,src_stride); } else { _add_mat<<>>(alpha,src,dst,d,src_stride); } } +void cudaF_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float* dst, MatrixDim d, int src_stride, int A_trans) { + if (A_trans) { + _add_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } else { + _add_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } +} + void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { _add_mat_mat_div_mat<<>>(A,B,C,dst,d, stride_a, stride_b, stride_c); } @@ -2312,12 +2403,12 @@ void cudaF_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, } void cudaF_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float* col, float beta, float* dst, MatrixDim d) { - _add_vec_to_cols<<>>(alpha,col,beta,dst,d); + _add_vec_to_cols<<>>(alpha,col,beta,dst,d); } void cudaF_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float* row, float beta, float* dst, MatrixDim d) { - _add_vec_to_rows<<>>(alpha,row,beta,dst,d); + _add_vec_to_rows<<>>(alpha,row,beta,dst,d); } void cudaF_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { @@ -2331,7 +2422,7 @@ void cudaF_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA // CURRENTLY UNUSED... 
void cudaF_apply_mask(dim3 Gr, dim3 Bl, float* mat, const char* mask, MatrixDim dmat, MatrixDim dmask) { - _apply_mask<<>>(mat,mask,dmat,dmask); + _apply_mask<<>>(mat,mask,dmat,dmask); } @@ -2367,17 +2458,17 @@ void cudaF_vec_max(const float* v, float* value, int dim) { _vec_max<<<1,CU1DBLOCK>>>(v, value, dim); } -void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat_trans <<<4,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaF_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { + _trace_mat_mat_trans<<>>(A,B,dA,B_stride,value); } -void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { - _trace_mat_mat <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaF_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { + _trace_mat_mat<32><<>>(A,B,dA,B_stride,value); } -void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { _add_diag_mat_mat<<>>(alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -2395,8 +2486,12 @@ void cudaF_pvec_sum(int Gr, int Bl, float* v, float* pvec_sum, int dim, int size _pvec_sum<<>>(v, pvec_sum, dim, size); } -void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s) { - _cuda_matrix_add_elements<<>>(data, dim, alpha, x, s); +void cudaF_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements) { + _cuda_matrix_add_elements<<>>(data, dim, alpha, x, num_elements); +} + +void cudaF_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data) { + _cuda_matrix_add_indexed_values<<>>(dim, alpha, indices, x, s, data); } void cudaF_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int s, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) { @@ -2415,6 +2510,10 @@ void cudaF_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float *cou _vec_apply_floor<<>>(v,floor_val,count,dim); } +void cudaF_vec_apply_ceiling(int Gr, int Bl, float* v, float ceiling_val, float *count, int dim) { + _vec_apply_ceiling<<>>(v, ceiling_val,count,dim); +} + void cudaF_vec_apply_exp(int Gr, int Bl, float* v, int dim) { _vec_apply_exp<<>>(v,dim); } @@ -2440,7 +2539,7 @@ void cudaF_add_mat_blockmat(dim3 Gr, dim3 Bl, float *data, MatrixDim d, const fl _add_mat_blockmat<<>>(data, d, Adata, A_num_rows, A_num_cols, A_row_stride, A_col_stride, B_cu_data, B_num_blocks, alpha, beta); - + } } @@ -2457,7 +2556,7 @@ void cudaF_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int * cu:: */ void cudaF_soft_hinge (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _soft_hinge<<>>(y, x, d, src_stride); + _soft_hinge<<>>(y, x, d, src_stride); } void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { @@ -2469,7 +2568,7 @@ void cudaF_group_max(dim3 Gr, dim3 Bl, 
float *y, const float *x, MatrixDim d, in } void cudaF_sigmoid (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _sigmoid<<>>(y, x, d, src_stride); + _sigmoid<<>>(y, x, d, src_stride); } void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d, int e_stride, int y_stride) { @@ -2477,13 +2576,17 @@ void cudaF_diff_sigmoid (dim3 Gr, dim3 Bl, float* eout, const float* e, const fl } void cudaF_tanh (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { - _tanh<<>>(y, x, d, src_stride); + _tanh<<>>(y, x, d, src_stride); } void cudaF_diff_tanh (dim3 Gr, dim3 Bl, float* eout, const float* e, const float* y, MatrixDim d, int e_stride, int y_stride) { _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaF_heaviside (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaF_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2493,7 +2596,7 @@ void cudaF_log_softmax_reduce (size_t Gr, size_t Bl, float* y, const float* x, M } void cudaF_splice(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { - _splice<<>>(y,x,off,d_out,d_in); + _splice<<>>(y,x,off,d_out,d_in); } void cudaF_one(int Gr, int Bl, float* x, int dim) { @@ -2517,20 +2620,20 @@ void cudaF_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim di } void cudaF_copy(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _copy<<>>(y,x,copy_from,d_out,d_in); + _copy<<>>(y,x,copy_from,d_out,d_in); } - -void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _randomize<<>>(y,x,copy_from,d_out,d_in); + +void cudaF_randomize(dim3 Gr, dim3 Bl, float* y, const float* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { + _randomize<<>>(y,x,copy_from,d_out,d_in); } void cudaF_regularize_l1(dim3 Gr, dim3 Bl, float* wei, float* grad, float l1, float lr, MatrixDim d, int stride_grad) { - _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); + _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); } -void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float* mat, float* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - _find_row_max_id<<>>(mat, vec_val, vec_id, voff, d); +void cudaF_find_row_max_id(dim3 Gr, dim3 Bl, const float* mat, float* vec_val, int32_cuda* vec_id, MatrixDim d) { + _find_row_max_id<<>>(mat, vec_val, vec_id, d); } void cudaF_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda* vec_tgt, float* mat_net_out, float* vec_log_post, MatrixDim d) { @@ -2541,10 +2644,6 @@ void cudaF_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, _copy_rows_from_vec<<>>(mat_out, d_out, v_in); } -void cudaF_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { - _copy_col_from_mat<<>>(v,col,mat,dmat,dim); -} - void cudaF_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { _copy_col_from_mat_df<<>>(v,col,mat,dmat,dim); } @@ -2578,7 +2677,7 @@ void cudaF_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, } /* - * "double" + * "double" */ /* @@ -2607,11 +2706,6 @@ void cudaDF_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim _copy_from_tp<<>>(A,B,dmat); } - -void 
cudaD_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { - _copy_col_from_vec<<>>(mat,v,col,d); -} - void cudaD_transpose_matrix(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _transpose_matrix<<>>(mat, d); } @@ -2685,7 +2779,7 @@ void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { } void cudaD_set_const(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _set_const<<>>(mat,value,d); + _set_const<<>>(mat,value,d); } void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { @@ -2693,7 +2787,7 @@ void cudaD_set_zero_above_diag(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { } void cudaD_add(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _add<<>>(mat,value,d); + _add<<>>(mat,value,d); } void cudaD_scale_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { @@ -2701,46 +2795,46 @@ void cudaD_scale_diag_packed(int Gr, int Bl, double* mat, double value, int dim) } void cudaD_scale(dim3 Gr, dim3 Bl, double* mat, double value, MatrixDim d) { - _scale<<>>(mat,value,d); + _scale<<>>(mat,value,d); } void cudaD_apply_log(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { - _apply_log<<>>(mat,d); + _apply_log<<>>(mat,d); } void cudaD_mul_elements(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _mul_elements<<>>(mat,A,dst_d,src_stride); + _mul_elements<<>>(mat,A,dst_d,src_stride); } void cudaD_div_elements(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _div_elements<<>>(mat,A,dst_d,src_stride); + _div_elements<<>>(mat,A,dst_d,src_stride); } void cudaD_max(dim3 Gr, dim3 Bl, double* mat, const double* A, MatrixDim dst_d, int src_stride) { - _max<<>>(mat,A,dst_d,src_stride); + _max<<>>(mat,A,dst_d,src_stride); } void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { - _mul_cols_vec<<>>(mat,scale,d); + _mul_cols_vec<<>>(mat,scale,d); } void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* scale, MatrixDim d) { _mul_rows_vec<<>>(mat,scale,d); } -void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double* y, const double* x, +void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size) { _mul_rows_group_mat<<>>(y, x, d, src_stride, group_size); } -void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, - const double* x2, MatrixDim d, int src_stride, +void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, + const double* x2, MatrixDim d, int src_stride, int group_size, double power) { _calc_pnorm_deriv<<>>(y, x1, x2, d, src_stride, group_size, power); } -void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, - const double* x2, MatrixDim d, int src_stride, +void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, + const double* x2, MatrixDim d, int src_stride, int group_size) { _calc_group_max_deriv<<>>(y, x1, x2, d, src_stride, group_size); } @@ -2753,7 +2847,15 @@ void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double* src, double* ds if (A_trans) { _add_mat_trans<<>>(alpha,src,dst,d,src_stride); } else { - _add_mat<<>>(alpha,src,dst,d,src_stride); + _add_mat<<>>(alpha,src,dst,d,src_stride); + } +} + +void cudaD_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double* src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double* dst, MatrixDim d, int src_stride, int A_trans) { + if (A_trans) { + 
_add_mat_blocks_trans<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); + } else { + _add_mat_blocks<<>>(alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride); } } @@ -2767,11 +2869,11 @@ void cudaD_sy_add_tr2(dim3 Gr, dim3 Bl, double alpha, double beta, const double* } void cudaD_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double* col, double beta, double* dst, MatrixDim d) { - _add_vec_to_cols<<>>(alpha,col,beta,dst,d); + _add_vec_to_cols<<>>(alpha,col,beta,dst,d); } void cudaD_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double* row, double beta, double* dst, MatrixDim d) { - _add_vec_to_rows<<>>(alpha,row,beta,dst,d); + _add_vec_to_rows<<>>(alpha,row,beta,dst,d); } void cudaD_add_mat_diag_vec(dim3 Gr, dim3 Bl, double alpha, double *mat, MatrixDim mat_dim, const double *mat2, int mat2_row_stride, int mat2_col_stride, const double *vec, double beta) { @@ -2784,7 +2886,7 @@ void cudaD_add_mat_mat_elements(dim3 Gr, dim3 Bl, double *data, const double *sr // CURRENTLY UNUSED... void cudaD_apply_mask(dim3 Gr, dim3 Bl, double* mat, const char* mask, MatrixDim dmat, MatrixDim dmask) { - _apply_mask<<>>(mat,mask,dmat,dmask); + _apply_mask<<>>(mat,mask,dmat,dmask); } @@ -2820,16 +2922,16 @@ void cudaD_vec_max(const double* v, double* value, int dim) { _vec_max<<<1,CU1DBLOCK>>>(v, value, dim); } -void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat_trans <<<4,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaD_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { + _trace_mat_mat_trans<<>>(A,B,dA,B_stride,value); } -void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { - _trace_mat_mat <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value); +void cudaD_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { + _trace_mat_mat<32><<>>(A,B,dA,B_stride,value); } -void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, +void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { _add_diag_mat_mat<<>>(alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); @@ -2839,10 +2941,6 @@ void cudaD_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, _add_vec_vec<<>>(alpha,v,x,y,beta,dim); } -void cudaD_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { - _copy_col_from_mat<<>>(v,col,mat,dmat,dim); -} - void cudaD_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { _copy_col_from_mat_df<<>>(v,col,mat,dmat,dim); } @@ -2859,8 +2957,12 @@ void cudaD_pvec_sum(int Gr, int Bl, double* v, double* pvec_sum, int dim, int si _pvec_sum<<>>(v,pvec_sum,dim,size); } -void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int s) { - _cuda_matrix_add_elements<<>>(data, dim, alpha, x, s); +void cudaD_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements) { + 
_cuda_matrix_add_elements<<>>(data, dim, alpha, x, num_elements); +} + +void cudaD_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data) { + _cuda_matrix_add_indexed_values<<>>(dim, alpha, indices, x, s, data); } void cudaD_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { @@ -2871,6 +2973,10 @@ void cudaD_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float *c _vec_apply_floor<<>>(v,floor_val,count,dim); } +void cudaD_vec_apply_ceiling(int Gr, int Bl, double* v, double ceiling_val, float *count, int dim) { + _vec_apply_ceiling<<>>(v,ceiling_val,count,dim); +} + void cudaD_vec_apply_exp(int Gr, int Bl, double* v, int dim) { _vec_apply_exp<<>>(v,dim); } @@ -2911,21 +3017,21 @@ void cudaD_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_data, int * cu:: */ void cudaD_soft_hinge (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _soft_hinge<<>>(y, x, d, src_stride); + _soft_hinge<<>>(y, x, d, src_stride); } -void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, +void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size, double power) { _group_pnorm<<>>(y, x, d, src_stride, group_size, power); } -void cudaD_group_max(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, +void cudaD_group_max(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride, int group_size) { _group_max<<>>(y, x, d, src_stride, group_size); } void cudaD_sigmoid (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _sigmoid<<>>(y, x, d, src_stride); + _sigmoid<<>>(y, x, d, src_stride); } void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d, int e_stride, int y_stride) { @@ -2933,13 +3039,17 @@ void cudaD_diff_sigmoid (dim3 Gr, dim3 Bl, double* eout, const double* e, const } void cudaD_tanh (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { - _tanh<<>>(y, x, d, src_stride); + _tanh<<>>(y, x, d, src_stride); } void cudaD_diff_tanh (dim3 Gr, dim3 Bl, double* eout, const double* e, const double* y, MatrixDim d, int e_stride, int y_stride) { _diff_tanh<<>>(eout, e, y, d, e_stride, y_stride); } +void cudaD_heaviside (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { + _heaviside<<>>(y, x, d, src_stride); +} + void cudaD_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, MatrixDim d, int src_stride) { _softmax_reduce<<>>(y, x, d, src_stride); } @@ -2949,7 +3059,7 @@ void cudaD_log_softmax_reduce (size_t Gr, size_t Bl, double* y, const double* x, } void cudaD_splice(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* off, MatrixDim d_out, MatrixDim d_in) { - _splice<<>>(y,x,off,d_out,d_in); + _splice<<>>(y,x,off,d_out,d_in); } void cudaD_one(int Gr, int Bl, double* x, int dim) { @@ -2973,19 +3083,19 @@ void cudaD_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim } void cudaD_copy(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _copy<<>>(y,x,copy_from,d_out,d_in); + _copy<<>>(y,x,copy_from,d_out,d_in); } - -void cudaD_randomize(dim3 Gr, dim3 Bl, double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { - _randomize<<>>(y,x,copy_from,d_out,d_in); + +void cudaD_randomize(dim3 Gr, dim3 Bl, 
double* y, const double* x, const int32_cuda* copy_from, MatrixDim d_out, MatrixDim d_in) { + _randomize<<>>(y,x,copy_from,d_out,d_in); } void cudaD_regularize_l1(dim3 Gr, dim3 Bl, double* wei, double* grad, double l1, double lr, MatrixDim d,int stride_grad) { - _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); + _regularize_l1<<>>(wei,grad,l1,lr,d,stride_grad); } -void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double* mat, double* vec_val, int32_cuda* vec_id, int32_cuda voff, MatrixDim d) { - _find_row_max_id<<>>(mat, vec_val, vec_id, voff, d); +void cudaD_find_row_max_id(dim3 Gr, dim3 Bl, const double* mat, double* vec_val, int32_cuda* vec_id, MatrixDim d) { + _find_row_max_id<<>>(mat, vec_val, vec_id, d); } void cudaD_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda* vec_tgt, double* mat_net_out, double* vec_log_post, MatrixDim d) { @@ -3041,19 +3151,19 @@ void cuda_copy_from_mat_dd(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_ } void cuda_copy_from_mat_df_trans(dim3 Gr, dim3 Bl, double* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } -void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); +void cuda_copy_from_mat_ff_trans(dim3 Gr, dim3 Bl, float* mat_out, const float* mat_in, MatrixDim d_out, MatrixDim d_in) { + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_fd_trans(dim3 Gr, dim3 Bl, float *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_mat_dd_trans(dim3 Gr, dim3 Bl, double *mat_out, const double* mat_in, MatrixDim d_out, MatrixDim d_in) { - _copy_from_mat_trans<<>>(mat_out,mat_in,d_out,d_in); + _copy_from_mat_trans<32><<>>(mat_out,mat_in,d_out,d_in); } void cuda_copy_from_smat_ff(dim3 Gr, dim3 Bl, float* mat_out, const MatrixElement* smat_in, MatrixDim d_out, MatrixIndexT_cuda d_in) { @@ -3081,19 +3191,6 @@ void cuda_copy_from_smat_dd_trans(dim3 Gr, dim3 Bl, double* mat_out, const Matri _copy_from_smat_trans<<>>(mat_out, smat_in, d_out, d_in); } -void cuda_copy_from_smat_as_vec_ff(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_fd(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_df(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} -void cuda_copy_from_smat_as_vec_dd(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - _copy_from_smat_as_vec<<>>(vec_out, smat_in, d_in); -} - void cudaF_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out) { _trace_mat_smat<<>>(mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 9464f9e261a..342f2705e74 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -4,7 +4,7 @@ // 2013 Ehsan Variani // 2014 Johns 
Hopkins University (author: Daniel Povey) // 2013 Hainan Xu -// 2013 Xiaohui Zhang +// 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // See ../../COPYING for clarification regarding multiple authors @@ -33,14 +33,14 @@ #include "cudamatrix/cu-kernels-ansi.h" /* - * In this file are C++ templated wrappers + * In this file are C++ templated wrappers * of the ANSI-C CUDA kernels */ namespace kaldi { /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, float* A, MatrixDim dimA) { cudaF_copy_upp_low(Gr, Bl, A, dimA); } @@ -108,19 +108,6 @@ inline void cuda_copy_from_smat_trans(dim3 Gr, dim3 Bl, double* mat_out, const M cuda_copy_from_smat_dd_trans(Gr, Bl, mat_out, smat_in, d_out, d_in); } -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_ff(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, float* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_fd(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_df(Gr, Bl, vec_out, smat_in, d_in); -} -inline void cuda_copy_from_smat_as_vec(dim3 Gr, dim3 Bl, double* vec_out, const MatrixElement* smat_in, MatrixIndexT_cuda d_in) { - cuda_copy_from_smat_as_vec_dd(Gr, Bl, vec_out, smat_in, d_in); -} - inline void cuda_trace_mat_smat(dim3 Gr, dim3 Bl, const float* mat_in, const MatrixElement* smat_in, MatrixDim mat_d_in, MatrixIndexT_cuda smat_d_in, float* trace_vec_out) { cudaF_trace_mat_smat(Gr, Bl, mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } @@ -134,7 +121,6 @@ inline void cuda_trace_mat_smat_trans(dim3 Gr, dim3 Bl, const double* mat_in, co cudaD_trace_mat_smat_trans(Gr, Bl, mat_in, smat_in, mat_d_in, smat_d_in, trace_vec_out); } -inline void cuda_copy_col_from_vec(int Gr, int Bl, float* mat, const float* v, int col, MatrixDim d) { cudaF_copy_col_from_vec(Gr,Bl,mat,v,col,d); } inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr,Bl,mat,d); } inline void cuda_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim dim) { cudaF_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include_sign, MatrixDim dim) { cudaF_apply_pow_abs(Gr,Bl,mat,power,include_sign, dim); } @@ -183,16 +169,17 @@ inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) {cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } +inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, float alpha, const float *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, 
A_trans); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaF_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, float alpha, const float *row, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); } inline void cuda_transpose_matrix(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_transpose_matrix(Gr, Bl, mat, d); } inline void cuda_sy_add_tr2(dim3 Gr, dim3 Bl, float alpha, float beta, const float* T, MatrixDim tdim, float *S, MatrixDim sdim) { cudaF_sy_add_tr2(Gr, Bl, alpha, beta, T, tdim, S, sdim); } inline void cuda_add_mat_diag_vec(dim3 Gr, dim3 Bl, float alpha, float *mat, MatrixDim mat_dim, const float *mat2, int mat2_row_stride, int mat2_col_stride, const float *vec, float beta) { cudaF_add_mat_diag_vec(Gr, Bl, alpha, mat, mat_dim, mat2, mat2_row_stride, mat2_col_stride, vec, beta); } -inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } +inline void cuda_add_mat_mat_elements(dim3 Gr, dim3 Bl, float *data, const float *srcA_data, const float *srcB_data, MatrixDim dim, int srcA_stride, int srcB_stride, float alpha, float beta) { cudaF_add_mat_mat_elements(Gr, Bl, data, srcA_data, srcB_data, dim, srcA_stride, srcB_stride, alpha, beta); } + - /* * CuVector */ @@ -205,22 +192,22 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, float* v, const float* a, int inline void cuda_vec_soft_max(int Gr, int Bl, float* v, int dim) { cudaF_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(v,value,dim); } inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); } -inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); } -inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, - int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, +inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M, + int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride, int N_col_stride, int threads_per_element, float beta) { cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); } inline void cuda_add_vec_vec(int Gr, int Bl, float 
alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); } -inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc) { cudaF_vec_sum(Gr,Bl,v,value,dim,inc); } inline void cuda_pvec_sum(int Gr, int Bl, float* vec, float* pvec_sum, int dim, int size) { cudaF_pvec_sum(Gr, Bl, vec, pvec_sum, dim, size); } inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, float *dst, const float *src, int dim) { cudaF_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); } inline void cuda_vec_apply_floor(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); } +inline void cuda_vec_apply_ceiling(int Gr, int Bl, float* v, float floor_val, float* num, int dim) { cudaF_vec_apply_ceiling(Gr,Bl,v,floor_val,num,dim); } inline void cuda_vec_apply_exp(int Gr, int Bl, float* v, int dim) { cudaF_vec_apply_exp(Gr,Bl,v,dim); } inline void cuda_vec_apply_log(int Gr, int Bl, float* v, float* flag, int dim) { cudaF_vec_apply_log(Gr,Bl,v,flag,dim); } inline void cuda_invert_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim d) { cudaF_invert_elements(Gr,Bl,data,d); } @@ -253,6 +240,7 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_heaviside(Gr,Bl,y,x,d,src_stride); } /* Bl: dimBlock value is fixed min(d.col, CU1DBLOCK), represent CU1DBLOCK threads reduce a row at the same time. 
Gr: the number of rows @@ -261,7 +249,7 @@ inline void cuda_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, float *wei, float *grad, float l1, float lr, MatrixDim d, int stride_grad) { cudaF_regularize_l1(Gr,Bl,wei,grad,l1,lr,d,stride_grad); } -inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); } +inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const float *mat, float *vec_val, int32_cuda *vec_id, MatrixDim d) { cudaF_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,d); } inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, float *mat_net_out, float *vec_log_post, MatrixDim d) { cudaF_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); } inline void cuda_copy_rows_from_vec(dim3 Gr, dim3 Bl, float *mat_out, MatrixDim d_out, const float *v_in) { cudaF_copy_rows_from_vec(Gr, Bl, mat_out, d_out, v_in); @@ -277,7 +265,8 @@ inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const float* x, float* y, Matrix inline void cuda_take_lower(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_lower(Gr,Bl,x,y,d_in); } inline void cuda_take_upper(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_upper(Gr,Bl,x,y,d_in); } inline void cuda_take_mean(dim3 Gr, dim3 Bl, const float* x, float* y, MatrixDim d_in) { cudaF_take_mean(Gr,Bl,x,y,d_in); } -inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int s) { cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, s); } +inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, float alpha, MatrixElement* x, int num_elements) { cudaF_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements); } +inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, float alpha, const Int32Pair* indices, const float* x, int s, float* data) { cudaF_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data); } inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int32 size, const float* z, MatrixDim d, float* z2, MatrixDim d2, float* t) {cudaF_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); } inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, float *data, MatrixDim dim, const float *src_data, MatrixDim src_dim, @@ -295,7 +284,7 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const float *data, cudaF_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const float *mat2, float *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaF_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } @@ -305,7 +294,7 @@ inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const float *mat1, const f // double versions /* - * CuMatrix + * CuMatrix */ inline void cuda_copy_upp_low(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_upp_low(Gr, Bl, A, dimA); } inline void cuda_copy_low_upp(dim3 Gr, dim3 Bl, double* A, MatrixDim dimA) { cudaD_copy_low_upp(Gr, Bl, A, dimA); } @@ -319,7 +308,6 @@ inline void 
cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const double* B inline void cuda_copy_from_tp_trans(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp_trans(Gr,Bl,A,B,dmat); } inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const double* B, MatrixDim dmat) { cudaD_copy_from_tp(Gr,Bl,A,B,dmat); } inline void cuda_copy_from_tp(dim3 Gr, dim3 Bl, double* A, const float* B, MatrixDim dmat) { cudaDF_copy_from_tp(Gr,Bl,A,B,dmat); } -inline void cuda_copy_col_from_vec(int Gr, int Bl, double* mat, const double* v, int col, MatrixDim d) { cudaD_copy_col_from_vec(Gr,Bl,mat,v,col,d); } inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { cudaD_apply_exp(Gr,Bl,mat,d); } inline void cuda_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim dim) { cudaD_apply_pow(Gr,Bl,mat,power,dim); } inline void cuda_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool include_sign, MatrixDim dim) { cudaD_apply_pow_abs(Gr,Bl,mat,power,include_sign,dim); } @@ -368,6 +356,7 @@ inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size) {cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } +inline void cuda_add_mat_blocks(dim3 Gr, dim3 Bl, double alpha, const double *src, int32_cuda num_row_blocks, int32_cuda num_col_blocks, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat_blocks(Gr, Bl, alpha, src, num_row_blocks, num_col_blocks, dst, d, src_stride, A_trans); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaD_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_rows(dim3 Gr, dim3 Bl, double alpha, const double *row, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_rows(Gr,Bl,alpha,row,beta,dst,d); } @@ -388,22 +377,22 @@ inline void cuda_vec_mul_elements(int Gr, int Bl, double* v, const double* a, in inline void cuda_vec_soft_max(int Gr, int Bl, double* v, int dim) { cudaD_vec_soft_max(Gr,Bl,v,dim); } inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_min(v,value,dim); } inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); } -inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); } -inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); } -inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, - int M_cols, int 
M_row_stride, int M_col_stride, const double *N, int N_row_stride, +inline void cuda_trace_mat_mat_trans(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_trace_mat_mat(dim3 Gr, dim3 Bl, const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(Gr,Bl,A,B,dA,B_stride,value); } +inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M, + int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride, int N_col_stride, int threads_per_element, double beta) { cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride, N_col_stride, threads_per_element, beta); } inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); } -inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_copy_col_from_mat_fd(int Gr, int Bl, float* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_fd(Gr,Bl,v,col,mat,dmat,dim); } inline void cuda_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc) { cudaD_vec_sum(Gr,Bl,v,value,dim,inc); } inline void cuda_pvec_sum(int Gr, int Bl, double* vec, double* pvec_sum, int dim, int size) { cudaD_pvec_sum(Gr,Bl,vec,pvec_sum,dim,size); } inline void cuda_vec_copy_diag_from_packed(int Gr, int Bl, double *dst, const double *src, int dim) { cudaD_vec_copy_diag_from_packed(Gr,Bl,dst,src,dim); } inline void cuda_vec_apply_floor(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_floor(Gr,Bl,v,floor_val,num,dim); } +inline void cuda_vec_apply_ceiling(int Gr, int Bl, double* v, double floor_val, float* num, int dim) { cudaD_vec_apply_ceiling(Gr,Bl,v,floor_val,num,dim); } inline void cuda_vec_apply_exp(int Gr, int Bl, double* v, int dim) { cudaD_vec_apply_exp(Gr,Bl,v,dim); } inline void cuda_vec_apply_log(int Gr, int Bl, double* v, double* flag, int dim) { cudaD_vec_apply_log(Gr,Bl,v,flag,dim); } inline void cuda_invert_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim d) { cudaD_invert_elements(Gr,Bl,data,d); } @@ -434,11 +423,12 @@ inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_diff_tanh(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_tanh(Gr,Bl,eout,e,y,d,e_stride,y_stride); } +inline void cuda_heaviside(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_heaviside(Gr,Bl,y,x,d,src_stride); } inline void cuda_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_softmax_reduce(Gr,Bl,y,x,d,src_stride); } 
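// A minimal, standalone sketch of the pattern used throughout this header
// (hypothetical names; the real wrappers forward to the cudaF_*/cudaD_* ANSI-C
// kernels rather than doing the work inline): each operation gets a float and a
// double overload under one name, so templated CuMatrix/CuVector code can call
// that single name and let overload resolution pick the right precision.
inline void cuda_example_scale(float *data, int n, float alpha) {
  for (int i = 0; i < n; i++) data[i] *= alpha;  // real version: launch cudaF_example_scale(...)
}
inline void cuda_example_scale(double *data, int n, double alpha) {
  for (int i = 0; i < n; i++) data[i] *= alpha;  // real version: launch cudaD_example_scale(...)
}
template<typename Real>
void ExampleScaleAll(Real *data, int n, Real alpha) {
  cuda_example_scale(data, n, alpha);  // resolves to the float or the double overload
}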
inline void cuda_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_log_softmax_reduce(Gr,Bl,y,x,d,src_stride); } inline void cuda_regularize_l1(dim3 Gr, dim3 Bl, double *wei, double *grad, double l1, double lr, MatrixDim d, int stride_grad) { cudaD_regularize_l1(Gr,Bl,wei,grad,l1,lr,d,stride_grad); } -inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, int32_cuda voff, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,voff,d); } +inline void cuda_find_row_max_id(dim3 Gr, dim3 Bl, const double *mat, double *vec_val, int32_cuda *vec_id, MatrixDim d) { cudaD_find_row_max_id(Gr,Bl,mat,vec_val,vec_id,d); } inline void cuda_diff_xent(dim3 Gr, dim3 Bl, const int32_cuda *vec_tgt, double *mat_net_out, double *vec_log_post, MatrixDim d) { cudaD_diff_xent(Gr,Bl,vec_tgt,mat_net_out,vec_log_post,d); } @@ -454,7 +444,8 @@ inline void cuda_copy_from_sp(dim3 Gr, dim3 Bl, const double* x, double* y, Matr inline void cuda_take_lower(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_lower(Gr,Bl,x,y,d_in); } inline void cuda_take_upper(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_upper(Gr,Bl,x,y,d_in); } inline void cuda_take_mean(dim3 Gr, dim3 Bl, const double* x, double* y, MatrixDim d_in) { cudaD_take_mean(Gr,Bl,x,y,d_in); } -inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int s) { cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, s); } +inline void cuda_matrix_add_elements(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, double alpha, MatrixElement* x, int num_elements) { cudaD_matrix_add_elements(Gr, Bl, data, dim, alpha, x, num_elements); } +inline void cuda_matrix_add_indexed_values(dim3 Gr, dim3 Bl, MatrixDim dim, double alpha, const Int32Pair* indices, const double* x, int s, double* data) { cudaD_matrix_add_indexed_values(Gr, Bl, dim, alpha, indices, x, s, data); } inline void cuda_comp_obj_deriv(dim3 Gr, dim3 Bl, MatrixElement* x, int32 size, const double* z, MatrixDim d, double* z2, MatrixDim d2, double* t) {cudaD_comp_obj_deriv(Gr,Bl,x,size,z,d,z2,d2,t); } inline void cuda_sum_column_ranges(dim3 Gr, dim3 Bl, double *data, MatrixDim dim, const double *src_data, MatrixDim src_dim, const Int32Pair *indices) { @@ -471,23 +462,23 @@ inline void cuda_matrix_lookup(dim3 Gr, dim3 Bl, const double *data, cudaD_matrix_lookup(Gr, Bl, data, dim, indices, indices_size, output); } -inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, +inline void cuda_equal_element_mask(dim3 Gr, dim3 Bl, const double *mat1, const double *mat2, double *mask, MatrixDim mat1_dim, int mat2_stride, int mask_stride) { cudaD_equal_element_mask(Gr, Bl, mat1, mat2, mask, mat1_dim, mat2_stride, mask_stride); } // Also include some template-friendly wrappers of cublas functions: -inline void cuda_axpy(int n, float alpha, const float *x, int incx, float *y, int incy) { - cublasSaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, float alpha, const float *x, int incx, float *y, int incy) { + return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cuda_axpy(int n, double alpha, const double *x, int incx, double *y, int incy) { - cublasDaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cuda_axpy(cublasHandle_t handle, int n, double alpha, const double *x, int incx, double 
*y, int incy) { + return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cuda_scal(int n, float alpha, float *x, int incx) { - cublasSscal(n, alpha, x, incx); +inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, float alpha, float *x, int incx) { + return cublasSscal_v2(handle, n, &alpha, x, incx); } -inline void cuda_scal(int n, double alpha, double *x, int incx) { - cublasDscal(n, alpha, x, incx); +inline cublasStatus_t cuda_scal(cublasHandle_t handle, int n, double alpha, double *x, int incx) { + return cublasDscal_v2(handle, n, &alpha, x, incx); } diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc index 2dae3bcb7b5..c36cb88f6f6 100644 --- a/src/cudamatrix/cu-math-test.cc +++ b/src/cudamatrix/cu-math-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cuda-math-test.cc +// cudamatrix/cu-math-test.cc // Copyright 2013 Johns Hopkins University (Author: David Snyder) diff --git a/src/cudamatrix/cu-math.h b/src/cudamatrix/cu-math.h index 453cf4439fb..65a4c0c4af3 100644 --- a/src/cudamatrix/cu-math.h +++ b/src/cudamatrix/cu-math.h @@ -1,7 +1,7 @@ // cudamatrix/cu-math.h // Copyright 2009-2012 Karel Vesely -// 2013 Johns Hopkins University (Author: David Snyder) +// 2013 Johns Hopkins University (Author: David Snyder) // See ../../COPYING for clarification regarding multiple authors // @@ -28,9 +28,9 @@ #include "base/timer.h" namespace kaldi { - + namespace cu { - + /// RegularizeL1 is a gradient step with l1 regularization added to the /// gradient. We don't let the value cross over zero from positive to negative /// or vice versa, in a single step. If an element tries to cross zero and is @@ -40,9 +40,9 @@ void RegularizeL1(CuMatrixBase *weight, CuMatrixBase *gradient, Real l1_penalty, Real learning_rate); /// Copies a permutation of src into tgt. The row permutation is specified in -/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The +/// copy_from_idx such that src.Row(copy_from_idx[r]) == tgt.Row(r). The /// dimensions of copy_from_idx must be equivalent to the number of rows in -/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. +/// tgt and src and all elements in the vector must be in [0, src.numRows()-1]. template void Randomize(const CuMatrixBase &src, const CuArray ©_from_idx, @@ -52,10 +52,10 @@ void Randomize(const CuMatrixBase &src, /// The dimensions of tgt must be equivalent to the number of rows in src /// and it must be that tgt.NumColumns == src.NumColumns * frame_offsets.Dim(). /// As a result, tgt(i, k*n_cols + j) == src(i + frame_offsets[k], j) for the -/// general case where i in [0..src.NumRows()-1], -/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] +/// general case where i in [0..src.NumRows()-1], +/// k in [0..frame_offsets.Dim()-1], j in [0..src.NumRows()-1] /// and n_cols = src.NumColumns(). If i + frame_offsets[k] is greater than the -/// number of rows in src or less than 0 than the right side of the equation +/// number of rows in src or less than 0 than the right side of the equation /// is replaced by src(src.NumRows()-1, j) or src(0, j) respectively, to avoid /// an index out of bounds. 
template @@ -73,6 +73,13 @@ void Copy(const CuMatrixBase &src, const CuArray ©_from_indices, CuMatrixBase *tgt); +template +void Group2norm(const CuMatrixBase &src, + CuMatrixBase *dest, + int32 group_stride); + + + } // namespace cu } // namespace kaldi diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index a37e38bcd17..9b7a707d2e5 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -38,15 +38,35 @@ inline CuSubMatrix::CuSubMatrix(const CuMatrixBase &mat, KALDI_ASSERT(row_offset >= 0 && col_offset >= 0 && row_offset + num_rows <= mat.num_rows_ && col_offset + num_cols <= mat.num_cols_); - this->data_ = mat.data_ + (row_offset * mat.stride_) + col_offset; + this->data_ = mat.data_ + static_cast(col_offset) + + static_cast(row_offset) * static_cast(mat.stride_); this->num_cols_ = num_cols; this->num_rows_ = num_rows; this->stride_ = mat.stride_; } } - + +template +inline CuSubMatrix::CuSubMatrix(const Real *data, + const MatrixIndexT num_rows, + const MatrixIndexT num_cols, + const MatrixIndexT stride): + CuMatrixBase(const_cast(data), num_rows, num_cols, stride) { + // in general if you use SubMatrix or CuSubMatrix, const-correctness is not + // preserved (preserving it would require us duplicating the class and it + // would have been a hassle). + + // Note: we used to check that stride >= num_cols. We no longer check for + // this as there are some situations where having stride < num_cols is useful, + // but beware because most if not all CUBLAS calls will crash when given + // such an input, even in a situation where it makes sense. + KALDI_ASSERT((num_rows != 0) == (num_cols != 0) && stride >= 0 && + num_rows >= 0 && num_cols >= 0 && stride >= 0); +} + + } // namespace kaldi #endif - + diff --git a/src/cudamatrix/cu-matrix-speed-test.cc b/src/cudamatrix/cu-matrix-speed-test.cc index 1ef970b9272..1052733b045 100644 --- a/src/cudamatrix/cu-matrix-speed-test.cc +++ b/src/cudamatrix/cu-matrix-speed-test.cc @@ -40,7 +40,102 @@ template std::string NameOf() { return (sizeof(Real) == 8 ? 
"" : ""); } - + +template void TestCuMatrixTransposeNS(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim / 2); + M.SetRandn(); + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + M.Transpose(); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter / 2) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeNS" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixTransposeS(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim); + M.SetRandn(); + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + M.Transpose(); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeS" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixTransposeCross(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix Mf(dim / 2, dim), ref(dim, dim / 2); + CuMatrix Md(dim, dim / 2); + Mf.SetRandn(); + ref = Mf; + + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + Md.CopyFromMat(Mf, kTrans); + Mf.CopyFromMat(Md, kTrans); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG<< "For CuMatrix::TransposeCross" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + + AssertEqual(ref, Mf); +} + +template void TestCuMatrixAddMat(int32 dim, + int32 num_row_blocks, int32 num_col_blocks) { + BaseFloat time_in_secs = 0.025; + CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); + A.SetRandn(); + B.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + A.AddMat(0.0, CuSubMatrix(B, i * dim, dim, j * dim, dim)); + } + } + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMat" << NameOf() << ", for dim = " + << dim << "numRowBlocks = "<< num_row_blocks << "numColBlocks = " + << num_col_blocks << ", speed was " << gflops << " gigaflops."; +} + +template void TestCuMatrixAddMatBlocks(int32 dim, + int32 num_row_blocks, int32 num_col_blocks) { + BaseFloat time_in_secs = 0.025; + CuMatrix A(dim, dim), B(dim * num_row_blocks, dim * num_col_blocks); + A.SetRandn(); + B.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + A.AddMatBlocks(0.0, B); + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * num_row_blocks * num_col_blocks * iter) + / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMatBlocks" << NameOf() << ", for dim = " + << dim << ", numRowBlocks = "<< num_row_blocks << ", numColBlocks = " + << num_col_blocks << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMatMat(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim), N(dim, dim), O(dim, dim); @@ -58,6 +153,42 @@ template void TestCuMatrixMatMat(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixMatMatBatched(int32 dim, int32 batchCount) { + std::vector* > a(batchCount), b(batchCount), c(batchCount); + std::vector* > A, B, C; + + for (int32 i = 0; i < batchCount; i++) { + // first create a Matrix intance and then creat a SubMatrix 
instance from that + a[i] = new CuMatrix(dim, dim); + b[i] = new CuMatrix(dim, dim); + c[i] = new CuMatrix(dim, dim); + a[i]->SetRandn(); + b[i]->SetRandn(); + A.push_back(new CuSubMatrix(*(a[i]), 0, a[i]->NumRows(), 0, + a[i]->NumCols())); + B.push_back(new CuSubMatrix(*(b[i]), 0, b[i]->NumRows(), 0, + b[i]->NumCols())); + C.push_back(new CuSubMatrix(*(c[i]), 0, c[i]->NumRows(), 0, + c[i]->NumCols())); + } + BaseFloat time_in_secs = 0.025; + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + AddMatMatBatched(static_cast(1.0), C, A, kNoTrans, B, kNoTrans, + static_cast(0.0)); + } + for (int32 i = 0; i< batchCount; i++) { + delete a[i]; delete b[i]; delete c[i]; + delete A[i]; delete B[i]; delete C[i]; + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * fdim * iter * batchCount) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::AddMatMatBatched" << NameOf() << ", for dim = " << dim + << ", batchSize = " << batchCount << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) { BaseFloat time_in_secs = 0.015; CuMatrix M(dim, dim), N(dim, dim); @@ -85,14 +216,14 @@ template void TestSymInvertPosDef(int32 dim) { M.SetRandn(); N.SymAddMat2(1.0, M, kNoTrans, 0.0); CuMatrix Ncopy(N); - + int iter = 0; Timer tim; for (;tim.Elapsed() < time_in_secs; iter++) { Ncopy.CopyFromMat(N); Ncopy.SymInvertPosDef(); } - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::TestCuInvertPosDef" << NameOf() << ", for dim = " @@ -100,9 +231,9 @@ template void TestSymInvertPosDef(int32 dim) { } -template +template static void TestCuMatrixCompObjfAndDeriv(int32 dim) { - BaseFloat time_in_secs = 0.025; + BaseFloat time_in_secs = 0.025; // Previously tested for larger dims, but test was slow. int32 n_r = dim, n_c = dim + Rand() % 5; @@ -111,7 +242,7 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { B.SetRandn(); B.Add(1.0); B.ApplyFloor(1.0e-10); - + std::vector > labels; for(int i = 0; i < n_r; i++) { for(int j = 0; j < n_c; j++) { @@ -135,7 +266,7 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { KALDI_LOG << "For CuMatrix::CompObjfAndDeriv" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; - + // do it one more time for correctness test. C.SetZero(); C.CompObjfAndDeriv(labels, B, &a, &b); @@ -144,30 +275,30 @@ static void TestCuMatrixCompObjfAndDeriv(int32 dim) { // repeat the real test. 
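// What the correctness check that follows verifies (inferred from the asserts
// below, not an independent specification): with weight matrix A and
// probability matrix B, the returned objective `a` should equal
//   sum_{i,j} A(i,j) * log B(i,j)   -- checked via TraceMatMat(log(B), A, kTrans),
// and the derivative matrix C should satisfy
//   C(i,j) = A(i,j) / B(i,j)        -- checked by comparing A.MulElements(1/B) with C.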
Real sum2; // sum(i, j) A(i, j) log(B(i, j)); - { + { CuMatrix Bcopy(B); Bcopy.ApplyLog(); sum2 = TraceMatMat(Bcopy, A, kTrans); } - + KALDI_ASSERT(ApproxEqual(a, sum2)); B.InvertElements(); A.MulElements(B); // each element of A is now A(i, j) / B(i, j); KALDI_ASSERT(ApproxEqual(A, C)); - + } -template +template static void TestCuFindRowMaxId(int32 dim) { int32 dimM = dim, dimN = dimM + Rand() % 5; Matrix Hi(dimM, dimN); Hi.SetRandn(); - + CuMatrix Di(dimM, dimN); Di.CopyFromMat(Hi); @@ -186,7 +317,7 @@ static void TestCuFindRowMaxId(int32 dim) { KALDI_LOG << "For CuMatrix::FindRowMaxId" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; - + // on cpu for(MatrixIndexT r=0; r void TestCuMatrixSigmoid(int32 dim) { << dim << ", speed was " << gflops << " gigaflops."; } +template void TestCuMatrixHeaviside(int32 dim) { + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim), N(dim, dim); + M.SetRandn(); + N.SetRandn(); + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + N.ApplyHeaviside(); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::Heaviside" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixMulRowsGroupMat(int32 dim) { BaseFloat time_in_secs = 0.025; @@ -298,7 +446,7 @@ template void TestCuMatrixGroupPnormDeriv(int32 dim) { int32 group_size = 4; CuMatrix M(dim, dim), N(dim, dim / group_size), O(dim, dim); M.SetRandn(); - N.GroupPnorm(M, 2.0); + N.GroupPnorm(M, 2.0); Timer tim; int32 iter = 0; @@ -348,8 +496,8 @@ template void TestCuMatrixGroupMaxDeriv(int32 dim) { template void TestCuMatrixTraceMatMat(int32 dim) { for (int32 n = 0; n < 2; n++) { MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans); - BaseFloat time_in_secs = 0.08; - + BaseFloat time_in_secs = 0.02; + CuMatrix M(dim, dim), N(dim, dim); M.SetRandn(); N.SetRandn(); @@ -360,7 +508,7 @@ template void TestCuMatrixTraceMatMat(int32 dim) { } BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf() + KALDI_LOG << "For CuMatrix::TraceMatMat" << NameOf() << (trans == kTrans ? " [transposed]" : "") << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } @@ -368,10 +516,10 @@ template void TestCuMatrixTraceMatMat(int32 dim) { template void TestCuMatrixCholesky(int32 dim) { - BaseFloat time_in_secs = 0.08; - + BaseFloat time_in_secs = 0.025; + CuMatrix M(dim, dim); - M.AddToDiag(100.0); + M.AddToDiag(100.0); Timer tim; int32 iter = 0; for (;tim.Elapsed() < time_in_secs; iter++) @@ -379,7 +527,7 @@ template void TestCuMatrixCholesky(int32 dim) { BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::Cholesky" << NameOf() + KALDI_LOG << "For CuMatrix::Cholesky" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } @@ -418,7 +566,7 @@ template void TestCuMatrixCopyFromTp(int32 dim, MatrixTransposeTy Matrix M_cpu(T_cpu, trans); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromTp" << (trans == kNoTrans ? 
"[NoTrans]":"[Trans]") @@ -442,7 +590,7 @@ template void TestCuMatrixCopyFromSp(int32 dim) { Matrix M_cpu(S_cpu); Matrix M2_cpu(M); AssertEqual(M_cpu, M2_cpu); - + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuMatrix::CopyFromSp" << NameOf() << ", for dim = " @@ -469,6 +617,19 @@ template void TestCuMatrixCopyUpperToLower(int32 dim) { } +template void TestCuMatrixResize(int32 dim) { + BaseFloat time_in_secs = 0.025; + Timer tim; + int32 iter = 0; + for (; tim.Elapsed() < time_in_secs; iter++) { + CuMatrixM(dim, dim, kUndefined); // we are testing the allocation and deallocation time. + } + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuMatrix::TestCuMatrixResize" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; +} + template void TestCuMatrixSetZeroAboveDiag(int32 dim) { BaseFloat time_in_secs = 0.025; CuMatrix M(dim, dim); @@ -482,9 +643,10 @@ template void TestCuMatrixSetZeroAboveDiag(int32 dim) { KALDI_LOG << "For CuMatrix::SetZeroAboveDiag" << NameOf() << ", for dim = " << dim << ", speed was " << gflops << " gigaflops."; } -template + +template void TestCuMatrixLookup(int32 dim) { - BaseFloat time_in_secs = 0.025; + BaseFloat time_in_secs = 0.025; int32 dimM = dim, dimN = dim; CuMatrix H(dimM, dimN); H.SetRandn(); @@ -493,6 +655,7 @@ void TestCuMatrixLookup(int32 dim) { std::vector output; // Generates the indices and the reference. int32 num_index = dim * dim; + output.resize(num_index); for (int32 j = 0; j < num_index; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; @@ -506,12 +669,12 @@ void TestCuMatrixLookup(int32 dim) { Timer tim; int32 iter = 0; for (; tim.Elapsed()< time_in_secs; iter++) - H.Lookup(indices, &output); + H.Lookup(indices, &(output[0])); - BaseFloat fdim = dim; + BaseFloat fdim = dim; BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09); - KALDI_LOG << "For CuMatrix::Lookup" << NameOf() << ", for dim = " - << dim << ", speed was " << gflops << " gigaflops."; + KALDI_LOG << "For CuMatrix::Lookup" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; } template void TestCuMatrixCopyRows1(int32 dim) { @@ -693,8 +856,16 @@ template void CudaMatrixSpeedTest() { sizes.push_back(512); sizes.push_back(1024); int32 ns = sizes.size(); + for (int32 s = 0; s < ns; s++) + TestCuMatrixResize(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixAddMat(sizes[s], 3, 3); + for (int32 s = 0; s < ns; s++) + TestCuMatrixAddMatBlocks(sizes[s], 3, 3); for (int32 s = 0; s < ns; s++) TestCuMatrixMatMat(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixMatMatBatched(sizes[s], 10); for (int32 s = 0; s < ns; s++) { TestCuMatrixAddDiagVecMat(sizes[s], kNoTrans); TestCuMatrixAddDiagVecMat(sizes[s], kTrans); @@ -705,6 +876,8 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCholesky(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSigmoid(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixHeaviside(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuFindRowMaxId(sizes[s]); for (int32 s = 0; s < ns; s++) @@ -737,22 +910,28 @@ template void CudaMatrixSpeedTest() { TestCuMatrixCopyUpperToLower(sizes[s]); for (int32 s = 0; s < ns; s++) TestCuMatrixSetZeroAboveDiag(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixLookup(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) 
TestCuMatrixCopyRows1(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixCopyRows2(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixCopyToRows(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows1(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRows2(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddToRows(sizes[s]); - for (int32 s = 0; s < ns; s++) + for (int32 s = 0; s < ns; s++) TestCuMatrixAddRowRanges(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeCross(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeS(sizes[s]); + for (int32 s = 0; s < ns; s++) + TestCuMatrixTransposeNS(sizes[s]); } diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index e54047e7262..74419ea25ba 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -1,9 +1,9 @@ -// cudamatrix/cuda-matrix-test.cc +// cudamatrix/cu-matrix-test.cc // Copyright 2010 Karel Vesely // 2013 Lucas Ondel // 2013 Johns Hopkins University (author: Daniel Povey) -// 2013 Hainan Xu +// 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013 Johns Hopkins University (author: Guoguo Chen) @@ -39,7 +39,7 @@ namespace kaldi { /* * INITIALIZERS */ -template +template static void InitRand(VectorBase *v) { for (MatrixIndexT i = 0; i < v->Dim(); i++) (*v)(i) = RandGauss(); @@ -47,7 +47,7 @@ static void InitRand(VectorBase *v) { -template +template static void InitRand(MatrixBase *M) { do { for (MatrixIndexT i = 0;i < M->NumRows();i++) @@ -58,7 +58,7 @@ static void InitRand(MatrixBase *M) { -template +template static void RandZeroToOneMatrix(MatrixBase* mat) { for(int32 r=0; rNumRows(); r++) for(int32 c=0; cNumCols(); c++) @@ -70,7 +70,7 @@ static void RandZeroToOneMatrix(MatrixBase* mat) { * Unit tests */ -template +template static void UnitTestCuMatrixTraceMatMat() { for (int32 i = 0; i < 2; i++) { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; @@ -102,11 +102,11 @@ static void UnitTestCuMatrixTraceMatMat() { } -template +template static void UnitTestCuCholesky() { for (int32 i = 0; i < 2; i++) { int32 M = 1 + Rand() % 10, N = M + 5; - + CuMatrix A(M, N); A.SetRandn(); CuMatrix S(M, M); @@ -133,7 +133,7 @@ static void UnitTestCuCholesky() { /* * CuMatrix */ -template +template static void UnitTestCuMatrixApplyLog() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -154,7 +154,7 @@ static void UnitTestCuMatrixApplyLog() { /* * CuMatrix */ -template +template static void UnitTestCuMatrixApplyExp() { int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; Matrix H(M, N); @@ -173,7 +173,7 @@ static void UnitTestCuMatrixApplyExp() { -template +template static void UnitTestCuMatrixSigmoid() { for (int32 i = 0; i < 2; i++) { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; @@ -193,7 +193,7 @@ static void UnitTestCuMatrixSigmoid() { } } -template +template static void UnitTestCuMatrixScale() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -208,7 +208,7 @@ static void UnitTestCuMatrixScale() { AssertEqual(H, E); } -template +template static void UnitTestCuMatrixAdd() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; Matrix H(M, N); @@ -224,7 +224,7 @@ static void UnitTestCuMatrixAdd() { } -template +template static void UnitTestCuMatrixSoftHinge() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; 
Matrix H(M, N); @@ -236,13 +236,13 @@ static void UnitTestCuMatrixSoftHinge() { E.SoftHinge(D); H.SoftHinge(H); - + Matrix H2(E); AssertEqual(H,H2); } -template +template static void UnitTestCuMatrixGroupPnorm() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; // M = 256; N = 256; @@ -266,7 +266,7 @@ static void UnitTestCuMatrixGroupPnorm() { } } -template +template static void UnitTestCuMatrixGroupMax() { int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200; // M = 256; N = 256; @@ -287,7 +287,7 @@ static void UnitTestCuMatrixGroupMax() { } } -template +template static void UnitTestCuMatrixSet() { for (int32 i = 0; i < 2; i++) { BaseFloat value= 0.333; @@ -302,20 +302,20 @@ static void UnitTestCuMatrixSet() { } -template +template static void UnitTestCuMatrixApplyPow() { for (int32 i = 0; i < 2; i++) { BaseFloat pow = 0.5 * (Rand() % 6); - + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); H.SetRandn(); H.Row(0).Set(0.0); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + if (pow != 1.0 && pow != 2.0 && pow != 3.0) H.MulElements(H); //make numbers positive - + CuMatrix cH(H); cH.ApplyPow(pow); @@ -326,17 +326,17 @@ static void UnitTestCuMatrixApplyPow() { } } -template +template static void UnitTestCuMatrixApplyPowAbs() { for (int32 i = 0; i < 2; i++) { BaseFloat pow = 0.5 * (Rand() % 6); - + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); H.SetRandn(); H.Row(0).Set(0.0); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + CuMatrix cH(H); cH.ApplyPowAbs(pow, true); @@ -383,7 +383,7 @@ static void UnitTestCuMatrixCopyRows() { num_cols = 10 + Rand() % 10; CuMatrix M(num_rows1, num_cols); M.SetRandn(); - + CuMatrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); std::vector reorder(num_rows2); @@ -404,7 +404,7 @@ static void UnitTestCuMatrixCopyRows() { for (int32 j = 0; j < num_cols; j++) if (reorder[i] < 0) O(i, j) = 0; else O(i, j) = M(reorder[i], j); - + AssertEqual(N1, O); AssertEqual(N2, O); } @@ -452,7 +452,7 @@ static void UnitTestCuMatrixAddRows() { num_rows2 = 10 + Rand() % 10, num_cols = 10 + Rand() % 10; CuMatrix M(num_rows1, num_cols); - M.SetRandn(); + M.SetRandn(); CuMatrix N1(num_rows2, num_cols), N2(num_rows2, num_cols), O(num_rows2, num_cols); @@ -592,7 +592,7 @@ static void UnitTestCuMatrixSumColumnRanges() { } CuMatrix cu_src(src); CuMatrix cu_dst(num_rows, num_cols2, kUndefined); - CuArray indices_tmp(indices); + CuArray indices_tmp(indices); cu_dst.SumColumnRanges(cu_src, indices_tmp); Matrix dst2(cu_dst); AssertEqual(dst, dst2); @@ -610,8 +610,8 @@ static void UnitTestCuMatrixAddRowRanges() { Matrix dst(num_rows2, num_cols); dst.SetRandn(); // Computes the indexes. - std::vector indexes(num_cols); - for (MatrixIndexT i = 0; i < num_cols; i++) { + std::vector indexes(num_rows2); + for (MatrixIndexT i = 0; i < num_rows2; i++) { indexes[i].first = Rand() % num_rows1; int32 headroom = num_rows1 - indexes[i].first, size = (Rand() % headroom) + 1; @@ -620,12 +620,11 @@ static void UnitTestCuMatrixAddRowRanges() { indexes[i].second <= num_rows1 && indexes[i].first >= 0); } - // Computes reference matrix. 
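// Reading of the fixed reference computation below (taken from the test code
// itself, not an external spec): AddRowRanges accumulates one row range of src
// per output row, i.e. for each i in [0, num_rows2) and each column j,
//   dst(i, j) += sum_{i2 = indexes[i].first}^{indexes[i].second - 1} src(i2, j),
// which is why `indexes` is sized num_rows2 and indexed by the row i, not by j.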
Matrix dst1(dst); for (MatrixIndexT i = 0; i < num_rows2; i++) { + int32 start = indexes[i].first, end = indexes[i].second; for (MatrixIndexT j = 0; j < num_cols; j++) { - int32 start = indexes[j].first, end = indexes[j].second; for (MatrixIndexT i2 = start; i2 < end; i2++) dst1(i, j) += src(i2, j); } @@ -640,7 +639,7 @@ static void UnitTestCuMatrixAddRowRanges() { } } - + template static void UnitTestCuMatrixCopyCols() { for (int32 p = 0; p < 2; p++) { @@ -649,7 +648,7 @@ static void UnitTestCuMatrixCopyCols() { num_rows = 10 + Rand() % 10; CuMatrix M(num_rows, num_cols1); M.SetRandn(); - + CuMatrix N(num_rows, num_cols2), O(num_rows, num_cols2); std::vector reorder(num_cols2); for (int32 i = 0; i < num_cols2; i++) @@ -657,7 +656,7 @@ static void UnitTestCuMatrixCopyCols() { CuArray reorder_gpu(reorder); N.CopyCols(M, reorder_gpu); - + for (int32 i = 0; i < num_rows; i++) for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; @@ -675,7 +674,7 @@ static void UnitTestCuMatrixAddCols() { num_rows = 10 + Rand() % 10; CuMatrix M(num_rows, num_cols1); M.SetRandn(); - + CuMatrix N(num_rows, num_cols2), O(num_rows, num_cols2); std::vector reorder(num_cols2); for (int32 i = 0; i < num_cols2; i++) @@ -683,7 +682,7 @@ static void UnitTestCuMatrixAddCols() { CuArray reorder_gpu(reorder); N.AddCols(M, reorder_gpu); - + for (int32 i = 0; i < num_rows; i++) for (int32 j = 0; j < num_cols2; j++) if (reorder[j] < 0) O(i, j) = 0; @@ -693,16 +692,16 @@ static void UnitTestCuMatrixAddCols() { } -template +template static void UnitTestCuMatrixApplyFloor() { for (int32 i = 0; i < 3; i++) { BaseFloat floor = 0.33 * (Rand() % 6); - + Matrix H(10 + Rand() % 600, 10 + Rand() % 20); H.SetRandn(); if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } - + CuMatrix cH(H); cH.ApplyFloor(floor); @@ -714,8 +713,28 @@ static void UnitTestCuMatrixApplyFloor() { } } +template +static void UnitTestCuMatrixApplyCeiling() { + + for (int32 i = 0; i < 3; i++) { + BaseFloat ceiling = 0.33 * (Rand() % 6); -template + Matrix H(10 + Rand() % 600, 10 + Rand() % 20); + H.SetRandn(); + if (i == 2) { Matrix tmp(H,kTrans); H = tmp; } + + CuMatrix cH(H); + + cH.ApplyCeiling(ceiling); + + H.ApplyCeiling(ceiling); + Matrix H2(cH); + + AssertEqual(H, H2); + } +} + +template static void UnitTestCuMatrixApplyHeaviside() { for (int32 i = 0; i < 1; i++) { @@ -735,11 +754,30 @@ static void UnitTestCuMatrixApplyHeaviside() { } -template +template +static void UnitTestCuMatrixHeaviside() { + + for (int32 i = 0; i < 1; i++) { + Matrix H(10 + Rand() % 60, 10 + Rand() % 20); + H.SetRandn(); + H.Row(0).Set(0.0); + if (i == 2) { Matrix tmp(H, kTrans); H = tmp; } + + CuMatrix cH(H); + CuMatrix cH2(H.NumRows(), H.NumCols(), kUndefined); + cH2.Heaviside(cH); + H.ApplyHeaviside(); + Matrix H2(cH2); + AssertEqual(H, H2); + } +} + + +template static void UnitTestCuMatrixMulElements() { for (int32 i = 0; i < 2; i++) { MatrixIndexT dimM = 100 + Rand() % 256, dimN = 100 + Rand() % 256; - + Matrix Ha(dimM, dimN); Matrix Hb(dimM, dimN); Ha.SetRandn(); @@ -760,11 +798,11 @@ static void UnitTestCuMatrixMulElements() { } } -template +template static void UnitTestCuMatrixDivElements() { for (int32 i = 0; i < 2; i++) { MatrixIndexT dimM = 100 + Rand() % 256, dimN = 100 + Rand() % 256; - + Matrix Ha(dimM, dimN); Matrix Hb(dimM, dimN); Ha.SetRandn(); @@ -785,7 +823,7 @@ static void UnitTestCuMatrixDivElements() { } } -template +template static void UnitTestCuMatrixMax() { Matrix Ha(100,100); Matrix Hb(100,100); @@ -808,7 +846,7 @@ static void 
UnitTestCuMatrixMax() { -template +template static void UnitTestCuMatrixMulColsVec() { Matrix Hm(100,99); Vector Hv(99); @@ -831,7 +869,7 @@ static void UnitTestCuMatrixMulColsVec() { -template +template static void UnitTestCuMatrixMulRowsVec() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -845,10 +883,10 @@ static void UnitTestCuMatrixMulRowsVec() { CuVector Dv(dimM); Dm.CopyFromMat(Hm); Dv.CopyFromVec(Hv); - + Dm.MulRowsVec(Dv); Hm.MulRowsVec(Hv); - + Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); @@ -856,7 +894,7 @@ static void UnitTestCuMatrixMulRowsVec() { } } -template +template static void UnitTestCuMatrixMulRowsGroupMat() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; @@ -872,17 +910,17 @@ static void UnitTestCuMatrixMulRowsGroupMat() { CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Ds.CopyFromMat(Hs); - + Dm.MulRowsGroupMat(Ds); Hm.MulRowsGroupMat(Hs); - + Matrix Hm2(dimM, dimN); Dm.CopyToMat(&Hm2); AssertEqual(Hm,Hm2); } } -template +template static void UnitTestCuMatrixGroupPnormDeriv() { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; int32 group_size = 1 + Rand() % 10; @@ -898,25 +936,25 @@ static void UnitTestCuMatrixGroupPnormDeriv() { Hm.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to // do derivatives. Hs.GroupPnorm(Hm, power); - + CuMatrix Dm(dimM, dimN); CuMatrix Dr(dimM, dimN); CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Dr.CopyFromMat(Hr); Ds.CopyFromMat(Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Dr.GroupPnormDeriv(Dm, Ds, power); Hr.GroupPnormDeriv(Hm, Hs, power); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Matrix Hr2(dimM, dimN); Dr.CopyToMat(&Hr2); AssertEqual(Hr,Hr2); } -template +template static void UnitTestCuMatrixGroupMaxDeriv() { int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200; int32 group_size = 1 + Rand() % 10; @@ -931,19 +969,19 @@ static void UnitTestCuMatrixGroupMaxDeriv() { Hm.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to // do derivatives. 
Hs.GroupMax(Hm); - + CuMatrix Dm(dimM, dimN); CuMatrix Dr(dimM, dimN); CuMatrix Ds(dimM, dimNs); Dm.CopyFromMat(Hm); Dr.CopyFromMat(Hr); Ds.CopyFromMat(Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Dr.GroupMaxDeriv(Dm, Ds); Hr.GroupMaxDeriv(Hm, Hs); - - // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; + + // KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ; Matrix Hr2(dimM, dimN); Dr.CopyToMat(&Hr2); AssertEqual(Hr,Hr2); @@ -963,7 +1001,7 @@ template static void UnitTestCuMatrixAddDiagVecMat() { KALDI_ASSERT(M.Sum() != 0.0); KALDI_ASSERT(N.Sum() != 0.0); - + CuVector V(dimM); V.SetRandn(); @@ -979,7 +1017,7 @@ template static void UnitTestCuMatrixAddDiagVecMat() { Mcheckrow.Scale(beta); Mcheckrow.AddVec(alpha * V(r), Nrow); } - + M.AddDiagVecMat(alpha, V, N, trans, beta); AssertEqual(M, Mcheck); KALDI_ASSERT(M.Sum() != 0.0); @@ -993,8 +1031,8 @@ template static void UnitTestCuMatrixAddMatDiagVec() { Real alpha = 0.43243, beta = 1.423; CuMatrix M(dimM, dimN), N(dimM, dimN), buf(dimM, dimN); - M.SetRandn(); - N.SetRandn(); + M.SetRandn(); + N.SetRandn(); buf.CopyFromMat(N); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); if (trans == kTrans) @@ -1003,9 +1041,9 @@ template static void UnitTestCuMatrixAddMatDiagVec() { CuVector V(dimN); V.SetRandn(); - CuMatrix Mcheck(M); + CuMatrix Mcheck(M); Mcheck.Scale(beta); - buf.MulColsVec(V); + buf.MulColsVec(V); Mcheck.AddMat(alpha, buf, kNoTrans); M.AddMatDiagVec(alpha, N, trans, V, beta); @@ -1032,7 +1070,7 @@ template static void UnitTestCuMatrixAddMatMatElements() { KALDI_ASSERT(M.Sum() != 0.0); } -template +template static void UnitTestCuMatrixDivRowsVec() { Matrix Hm(100,99); Vector Hv(100); @@ -1056,7 +1094,7 @@ static void UnitTestCuMatrixDivRowsVec() { -template +template static void UnitTestCuMatrixAddMat() { Matrix Ha(100,100); Matrix Hb(100,100); @@ -1075,15 +1113,15 @@ static void UnitTestCuMatrixAddMat() { Da.CopyToMat(&Ha2); AssertEqual(Ha,Ha2); - + //check use with submatrix CuMatrix mat1(10,10,kSetZero); mat1.AddMat(1.0,Da.Range(5,10,12,10)); //different stride for mat1,mat2 CuMatrix mat2(Da.Range(5,10,12,10)); AssertEqual(mat1,mat2); - + for (int i = 0; i < 10; i++) { - int32 N = 5 * (10 + Rand() % 10), M = 100 + Rand() % 50; + int32 N = 5 * (10 + Rand() % 10), M = 100 + Rand() % 50; Matrix Hc(N,M); Matrix Hd(M,N); Hc.SetRandn(); @@ -1093,11 +1131,11 @@ static void UnitTestCuMatrixAddMat() { CuMatrix Dd(M,N); Dc.CopyFromMat(Hc); Dd.CopyFromMat(Hd); - + Real alpha = 0.5; Dc.AddMat(alpha,Dd,kTrans); Hc.AddMat(alpha,Hd,kTrans); - + Matrix Hc2(N,M); Dc.CopyToMat(&Hc2); AssertEqual(Hc,Hc2); @@ -1105,13 +1143,46 @@ static void UnitTestCuMatrixAddMat() { // check use with submatrix CuMatrix mat3(N/5,M,kSetZero); mat3.AddMat(1.0, Dd.Range(0,M,0,N/5),kTrans); - + CuMatrix mat4(Dd.Range(0,M,0,N/5),kTrans); AssertEqual(mat3,mat4); } } -template +template +static void UnitTestCuMatrixAddMatBlocks() { + int32 num_row_blocks = 10, num_col_blocks = 20; + Matrix Ha1(100, 100), Ha2(100, 100); + Matrix Hb(100 * num_row_blocks, 100 * num_col_blocks); + Ha1.SetRandn(); + Ha2.SetRandn(); + Hb.SetRandn(); + + CuMatrix Da1(100, 100), Da2(100, 100); + CuMatrix Db(100 * num_row_blocks, 100 * num_col_blocks); + Da1.CopyFromMat(Ha1); + Da2.CopyFromMat(Ha2); + Db.CopyFromMat(Hb); + + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + 
SubMatrix Hs(Hb.Range(i * 100, 100, j * 100, 100)); + Ha1.AddMat(0.5, Hs, kNoTrans); + Ha2.AddMat(0.5, Hs, kTrans); + } + } + + Da1.AddMatBlocks(0.5, Db, kNoTrans); + Da2.AddMatBlocks(0.5, Db, kTrans); + Matrix Ha11(100, 100); + Da1.CopyToMat(&Ha11); + AssertEqual(Ha1,Ha11); + Matrix Ha22(100, 100); + Da2.CopyToMat(&Ha22); + AssertEqual(Ha2,Ha22); +} + +template static void UnitTestCuMatrixSum() { int32 M = 100 + Rand() % 300, N = 100 + Rand() % 300; CuMatrix A(M, N); @@ -1121,7 +1192,7 @@ static void UnitTestCuMatrixSum() { } -template +template static void UnitTestCuMatrixAddVecToCols() { Matrix Hm(100,99); Vector Hv(100); @@ -1144,7 +1215,7 @@ static void UnitTestCuMatrixAddVecToCols() { -template +template static void UnitTestCuMatrixAddVecToRows() { Matrix Hm(100,99); Vector Hv(99); @@ -1166,7 +1237,7 @@ static void UnitTestCuMatrixAddVecToRows() { } -template +template static void UnitTestCuMatrixSymAddMat2() { for (int32 i = 0; i < 2; i++) { int32 dimM = 10 + Rand() % 200, dimN = 10 + Rand() % 30; @@ -1196,7 +1267,7 @@ static void UnitTestCuMatrixSymAddMat2() { -template +template static void UnitTestCuMatrixSymInvertPosDef() { for (int32 i = 0; i < 2; i++) { int32 dimM = 10 + Rand() % 200, dimN = dimM + 20; @@ -1243,7 +1314,7 @@ static void UnitTestCuMatrixSymInvertPosDef() { } -template +template static void UnitTestCuMatrixAddMatMat() { Matrix Ha(200,100); Matrix Hb(100,200); @@ -1274,7 +1345,98 @@ static void UnitTestCuMatrixAddMatMat() { } -template +template +static void UnitTestCuMatrixAddVecVec() { + Vector x(100); + Vector y(200); + x.SetRandn(); + y.SetRandn(); + + CuVector Cux(100); + CuVector Cuy(200); + Cux.CopyFromVec(x); + Cuy.CopyFromVec(y); + + Matrix A(100,200); + CuMatrix CuA(100,200); + + A.AddVecVec(0.5f, x, y); + CuA.AddVecVec(0.5f, Cux, Cuy); + Matrix A2(100, 200); + CuA.CopyToMat(&A2); + + AssertEqual(A,A2); +} + + +template +static void UnitTestCuMatrixAddMatMatBatched() { + const int32 batchCount = 10; + std::vector* > Ha(batchCount), Hb(batchCount), Hc1(batchCount), Hc2(batchCount); + std::vector* > Da(batchCount), Db(batchCount), Dc1(batchCount), Dc2(batchCount); + std::vector* > HA, HB, HC1, HC2; + std::vector* > DA, DB, DC1, DC2; + + for (int32 i = 0; i < batchCount; i++) { + // first create a Matrix intance and then creat a SubMatrix instance from that + Ha[i] = new Matrix(200, 100); + Hb[i] = new Matrix(100, 200); + Hc1[i] = new Matrix(200, 200); + Hc2[i] = new Matrix(100, 100); + Ha[i]->SetRandn(); + Hb[i]->SetRandn(); + HA.push_back(new SubMatrix(*(Ha[i]), 0, Ha[i]->NumRows(), 0, + Ha[i]->NumCols())); + HB.push_back(new SubMatrix(*(Hb[i]), 0, Hb[i]->NumRows(), 0, + Hb[i]->NumCols())); + HC1.push_back(new SubMatrix(*(Hc1[i]), 0, Hc1[i]->NumRows(), 0, + Hc1[i]->NumCols())); + HC2.push_back(new SubMatrix(*(Hc2[i]), 0, Hc2[i]->NumRows(), 0, + Hc2[i]->NumCols())); + + // first create a CuMatrix intance and then creat a CuSubMatrix instance from that + Da[i] = new CuMatrix(200, 100); + Db[i] = new CuMatrix(100, 200); + Dc1[i] = new CuMatrix(200, 200); + Dc2[i] = new CuMatrix(100, 100); + Da[i]->CopyFromMat(*(Ha[i])); + Db[i]->CopyFromMat(*(Hb[i])); + DA.push_back(new CuSubMatrix(*(Da[i]), 0, Da[i]->NumRows(), 0, + Da[i]->NumCols())); + DB.push_back(new CuSubMatrix(*(Db[i]), 0, Db[i]->NumRows(), 0, + Db[i]->NumCols())); + DC1.push_back(new CuSubMatrix(*(Dc1[i]), 0, Dc1[i]->NumRows(), 0, + Dc1[i]->NumCols())); + DC2.push_back(new CuSubMatrix(*(Dc2[i]), 0, Dc2[i]->NumRows(), 0, + Dc2[i]->NumCols())); + } + + AddMatMatBatched(static_cast(0.5f), DC1, DA, 
kNoTrans, DB, kNoTrans, + static_cast(0.0f)); + AddMatMatBatched(static_cast(0.5f), DC2, DA, kTrans, DB, kTrans, + static_cast(0.0f)); + + // used to store results from DC1 and DC2 for equality check + Matrix Hca1(200,200); + Matrix Hca2(100,100); + + // equality check + for (int32 i = 0; i< batchCount; i++) { + (*HC1[i]).AddMatMat(0.5f, *(HA[i]), kNoTrans, *(HB[i]), kNoTrans, 0.0f); + (*HC2[i]).AddMatMat(0.5f, *(HA[i]), kTrans, *(HB[i]), kTrans, 0.0f); + DC1[i]->CopyToMat(&Hca1); + DC2[i]->CopyToMat(&Hca2); + AssertEqual(*(HC1[i]), Hca1); + AssertEqual(*(HC2[i]), Hca2); + delete Ha[i]; delete Hb[i]; delete Hc1[i]; delete Hc2[i]; + delete HA[i]; delete HB[i]; delete HC1[i]; delete HC2[i]; + delete Da[i]; delete Db[i]; delete Dc1[i]; delete Dc2[i]; + delete DA[i]; delete DB[i]; delete DC1[i]; delete DC2[i]; + } +} + + +template static void UnitTestCuMatrixAddToDiag() { for (int32 i = 0; i < 10; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1288,7 +1450,7 @@ static void UnitTestCuMatrixAddToDiag() { } } -template +template static void UnitTestCuMatrixAdd2() { for (int32 i = 0; i < 10; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1307,10 +1469,10 @@ template static void UnitTestCuMatrixCopyFromMat() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); A.SetRandn(); - CuMatrix E(A); + CuMatrix E(A); CuMatrix B(dim, dim); B.CopyFromMat(E); @@ -1338,7 +1500,7 @@ template static void UnitTestCuMatrixAddMatTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); Matrix B(dim, dim); TpMatrix C(dim); @@ -1348,7 +1510,7 @@ static void UnitTestCuMatrixAddMatTp() { CuMatrix D(A); CuMatrix E(B); CuTpMatrix F(C); - + A.AddMatTp(1.0, B, kNoTrans, C, kNoTrans, 1.0); D.AddMatTp(1.0, E, kNoTrans, F, kNoTrans, 1.0); @@ -1364,7 +1526,7 @@ static void UnitTestCuMatrixTranspose() { MatrixIndexT dimM = 5 * i + Rand() % 10, dimN = dimM; if (i % 2 == 0) dimN += 5; - + CuMatrix A(dimM, dimN); A.SetRandn(); CuMatrix B(A, kTrans); @@ -1377,7 +1539,7 @@ template static void UnitTestCuMatrixAddTpMat() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 5 * i + Rand() % 10; - + Matrix A(dim, dim); Matrix B(dim, dim); TpMatrix C(dim); @@ -1387,7 +1549,7 @@ static void UnitTestCuMatrixAddTpMat() { CuMatrix D(A); CuMatrix E(B); CuTpMatrix F(C); - + A.AddTpMat(1.0, C, kNoTrans, B, kNoTrans, 1.0); D.AddTpMat(1.0, F, kNoTrans, E, kNoTrans, 1.0); @@ -1399,7 +1561,7 @@ static void UnitTestCuMatrixAddTpMat() { /* * CuVector unit tests */ -template +template static void UnitTestCuVectorAddVec() { Vector Hv(777); Vector Hw(777); @@ -1417,13 +1579,13 @@ static void UnitTestCuVectorAddVec() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - + AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuVectorAddRowSumMat() { const int32 X=4321, Y=19; Real alpha=0.1, beta=0.7; @@ -1440,7 +1602,7 @@ static void UnitTestCuVectorAddRowSumMat() { Dv.CopyFromVec(Hv); Dv.AddRowSumMat(alpha,Dm,beta); - + Hv_accu.SetZero(); Hv_accu.AddRowSumMat(1.0, Hm); Hv.Scale(beta); @@ -1454,7 +1616,7 @@ static void UnitTestCuVectorAddRowSumMat() { -template +template static void UnitTestCuVectorAddRowSumMatLarge() { Matrix Hm(1000,990); Vector Hv(990); @@ -1468,7 +1630,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() { Dv.CopyFromVec(Hv); Dv.AddRowSumMat(0.5,Dm,0.7); - + Hv_accu.SetZero(); Hv_accu.AddRowSumMat(1.0, Hm); Hv.Scale(0.7); @@ -1482,7 +1644,7 @@ static void UnitTestCuVectorAddRowSumMatLarge() { 
-template +template static void UnitTestCuVectorAddColSumMat() { const int32 X=19, Y=4321; Real alpha=0.5, beta=0.7; @@ -1499,7 +1661,7 @@ static void UnitTestCuVectorAddColSumMat() { Dv.CopyFromVec(Hv); Dv.AddColSumMat(alpha,Dm,beta); - + Hv_accu.SetZero(); Hv_accu.AddColSumMat(1.0, Hm); Hv.Scale(beta); @@ -1511,7 +1673,7 @@ static void UnitTestCuVectorAddColSumMat() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuSubMatrix() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -1530,7 +1692,7 @@ static void UnitTestCuSubMatrix() { -template +template static void UnitTestCuVectorAddColSumMatLarge() { Matrix Hm(1000,990); Vector Hv(1000); @@ -1544,7 +1706,7 @@ static void UnitTestCuVectorAddColSumMatLarge() { Dv.CopyFromVec(Hv); Dv.AddColSumMat(0.5, Dm, 0.7); - + Hv_accu.SetZero(); Hv_accu.AddColSumMat(1.0, Hm); Hv.Scale(0.7); @@ -1558,7 +1720,7 @@ static void UnitTestCuVectorAddColSumMatLarge() { -template +template static void UnitTestCuVectorInvertElements() { Vector Hv(777); InitRand(&Hv); @@ -1571,11 +1733,11 @@ static void UnitTestCuVectorInvertElements() { Vector Hv2(777); Dv.CopyToVec(&Hv2); - + AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuMatrixInvertElements() { Matrix Hm(77, 77); InitRand(&Hm); @@ -1588,7 +1750,7 @@ static void UnitTestCuMatrixInvertElements() { Matrix Hm2(77, 77); Dm.CopyToMat(&Hm2); - + AssertEqual(Hm,Hm2); } @@ -1639,7 +1801,7 @@ static void UnitTestCuVectorAddTpVec() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuApproxEqual() { Real tol = 0.1; for (int32 i = 0; i < 2; i++) { @@ -1655,7 +1817,7 @@ static void UnitTestCuApproxEqual() { } } -template +template static void UnitTestCuVectorMulTp() { Vector Hv(300); InitRand(&Hv); @@ -1677,7 +1839,7 @@ static void UnitTestCuVectorMulTp() { AssertEqual(Hv,Hv2); } -template +template static void UnitTestCuCopy() { for (int32 i = 0; i < 10; i++) { int32 M = 1 + Rand() % 10, N = 1 + Rand() % 10; @@ -1700,13 +1862,13 @@ static void UnitTestCuCopy() { CuMatrix J(I, kTrans); Matrix K(J, kTrans); CuMatrix L(K, kNoTrans); - + KALDI_ASSERT(A.ApproxEqual(L)); } } -template +template static void UnitTestCuSigmoid() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -1733,7 +1895,7 @@ static void UnitTestCuSigmoid() { -template +template static void UnitTestCuDiffSigmoid() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -1764,7 +1926,7 @@ static void UnitTestCuDiffSigmoid() { -template +template static void UnitTestCuSoftmax() { for (int32 i = 0; i < 2; i++) { @@ -1775,7 +1937,7 @@ static void UnitTestCuSoftmax() { Matrix Ho(row,col); Hi.SetRandn(); Hi.Scale(5.0); - + CuMatrix Di(row, col); CuMatrix Do(row, col); Di.CopyFromMat(Hi); @@ -1795,7 +1957,7 @@ static void UnitTestCuSoftmax() { } -template +template static void UnitTestCuLogSoftmax() { for (int32 i = 0; i < 2; i++) { @@ -1806,7 +1968,7 @@ static void UnitTestCuLogSoftmax() { Matrix Ho(row, col); Hi.SetRandn(); Hi.Scale(5.0); - + CuMatrix Di(row, col); CuMatrix Do(row, col); Di.CopyFromMat(Hi); @@ -1826,7 +1988,7 @@ static void UnitTestCuLogSoftmax() { } -template +template static void UnitTestCuFindRowMaxId() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; @@ -1860,7 +2022,7 @@ static void UnitTestCuFindRowMaxId() { -template +template static void UnitTestCuDiffXent() { int32 X=100, Y=111; //nnet output / diff @@ -1878,7 +2040,7 @@ static void UnitTestCuDiffXent() { //logpost vector 
Vector Hlogpost(X); CuVector Dlogpost(X); - + //gpu Di.DiffXent(Dtgt, &Dlogpost); //cpu @@ -1906,7 +2068,7 @@ template void UnitTestCheck() { CuMatrix Dj(Di); KALDI_LOG << Dj.NumRows(); - + } @@ -1957,7 +2119,7 @@ void UnitTestCuTanh() { H.SetRandn(); CuMatrix D(100,110); D.CopyFromMat(H); - + //gpu CuMatrix Di(100,110); Di.Tanh(D); @@ -1970,7 +2132,7 @@ void UnitTestCuTanh() { AssertEqual(Df,Hf); } -template +template static void UnitTestCuDiffTanh() { Matrix Hi(100,111); Matrix Ho(100,111); @@ -2016,7 +2178,7 @@ static void UnitTestCuMatrixSetRandn() { N.SetRandn(); AssertEqual(M, N); } - + for (int32 i = 0; i < 5; i++) { MatrixIndexT rows = 100 + Rand() % 50, cols = 100 + Rand() % 50; CuMatrix M(rows, cols); @@ -2118,7 +2280,7 @@ static void UnitTestCuMatrixSetZeroAboveDiag() { Matrix A_orig(A); A.SetZeroAboveDiag(); Matrix A_copy(A); - + for (int32 i = 0; i < dim; i++) { for (int32 j = 0; j < dim; j++) { Real aval = A_copy(i, j), aorigval = A_orig(i, j); @@ -2151,14 +2313,14 @@ static void UnitTestCuMatrixCopyUpperToLower() { } -template +template static void UnitTestCuMatrixObjfDeriv() { int32 n_r = 100 + Rand() % 200, n_c = 20 + Rand() % 30; CuMatrix A(n_r, n_c), B(n_r, n_c); B.SetRandn(); B.Add(1.0); B.ApplyFloor(1.0e-10); - + std::vector > labels; for(int i = 0; i < n_r; i++) { for(int j = 0; j < n_c; j++) { @@ -2176,11 +2338,11 @@ static void UnitTestCuMatrixObjfDeriv() { // (sv_labels, logprobs, &tot_objf, &tot_weight) C.CompObjfAndDeriv(labels, B, &a, &b); - + KALDI_ASSERT(ApproxEqual(b, A.Sum())); Real sum2; // sum(i, j) A(i, j) log(B(i, j)); - { + { CuMatrix Bcopy(B); Bcopy.ApplyLog(); sum2 = TraceMatMat(Bcopy, A, kTrans); @@ -2192,43 +2354,58 @@ static void UnitTestCuMatrixObjfDeriv() { KALDI_ASSERT(ApproxEqual(A, C)); } -template +template static void UnitTestCuMatrixAddElements() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 50, dimN = 100 + Rand() % 50; // int32 dimM = 256, dimN = 256; CuMatrix H(dimM, dimN); H.SetRandn(); + CuMatrix H_copy(H); CuMatrix M(H); + int32 num_elements = 100 + Rand() % 10; std::vector > input; + std::vector input_index; + Real *input_value = new Real[num_elements]; BaseFloat scale = -1 + (0.33 * (Rand() % 5)); - for (int32 j = 0; j < 100 + Rand() % 10; j++) { + for (int32 j = 0; j < num_elements; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; + Int32Pair tmp_pair; + tmp_pair.first = r; + tmp_pair.second = c; Real offset = -1 + (0.33 * (Rand() % 5)); M(r, c) += scale * offset; MatrixElement t = {r, c, offset}; input.push_back(t); + input_index.push_back(tmp_pair); + input_value[j] = offset; } H.AddElements(scale, input); + CuArray cu_input_index(input_index); + H_copy.AddElements(scale, cu_input_index, input_value); + delete[] input_value; AssertEqual(H, M); + AssertEqual(H_copy, M); } } -template +template static void UnitTestCuMatrixLookup() { for (int32 i = 0; i < 2; i++) { int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; CuMatrix H(dimM, dimN); H.SetRandn(); + int32 num_elements = 10 + Rand() % 10; std::vector indices; std::vector reference; std::vector output; + output.resize(num_elements); // Generates the indices and the reference. 
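The extended AddElements test above applies the same sparse update twice, once through MatrixElement structs and once through an index array plus a value array, and checks both against a reference matrix updated in place. The semantics being exercised is just M(r, c) += scale * value for each listed element; a plain C++ sketch of that reference behaviour (Element is a hypothetical stand-in for MatrixElement):

#include <cassert>
#include <vector>

struct Element { int row, col; double value; };

// Reference semantics of the sparse update checked by the test:
// for every listed element, M(row, col) += scale * value.
static void RefAddElements(double scale, const std::vector<Element> &elems,
                           std::vector<double> *m, int cols) {
  for (const Element &e : elems)
    (*m)[e.row * cols + e.col] += scale * e.value;
}

int main() {
  const int rows = 2, cols = 3;
  std::vector<double> m(rows * cols, 1.0);
  std::vector<Element> elems = {{0, 1, 2.0}, {1, 2, -4.0}, {0, 1, 2.0}};
  RefAddElements(0.5, elems, &m, cols);
  assert(m[0 * cols + 1] == 3.0);   // 1.0 + 0.5*2.0 + 0.5*2.0
  assert(m[1 * cols + 2] == -1.0);  // 1.0 + 0.5*(-4.0)
  return 0;
}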
- for (int32 j = 0; j < 10 + Rand() % 10; j++) { + for (int32 j = 0; j < num_elements; j++) { MatrixIndexT r = Rand() % dimM; MatrixIndexT c = Rand() % dimN; @@ -2239,13 +2416,13 @@ static void UnitTestCuMatrixLookup() { reference.push_back(H(r, c)); } - H.Lookup(indices, &output); + H.Lookup(indices, &(output[0])); KALDI_ASSERT(reference == output); } } -template +template static void UnitTestCuMatrixEqualElementMask() { CuMatrix m1(10,9), m2(10,9); CuMatrix mask_same, mask_different; @@ -2280,12 +2457,14 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixScale(); UnitTestCuMatrixSigmoid(); UnitTestCuMatrixSoftHinge(); - UnitTestCuMatrixApplyPow(); - UnitTestCuMatrixApplyPowAbs(); + UnitTestCuMatrixApplyPow(); + UnitTestCuMatrixApplyPowAbs(); UnitTestCuMatrixSet(); UnitTestCuMatrixAdd(); UnitTestCuMatrixApplyFloor(); + UnitTestCuMatrixApplyCeiling(); UnitTestCuMatrixApplyHeaviside(); + UnitTestCuMatrixHeaviside(); UnitTestCuMatrixMulElements(); UnitTestCuMatrixDivElements(); UnitTestCuMatrixMax(); @@ -2293,11 +2472,14 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixDivRowsVec(); UnitTestCuMatrixAddMat(); + UnitTestCuMatrixAddMatBlocks(); UnitTestCuMatrixSum(); UnitTestCuMatrixAddVecToCols(); UnitTestCuMatrixAddVecToRows(); UnitTestCuMatrixAddMatMat(); + UnitTestCuMatrixAddVecVec(); UnitTestCuMatrixSymAddMat2(); + UnitTestCuMatrixAddMatMatBatched(); UnitTestCuMatrixSymInvertPosDef(); UnitTestCuMatrixCopyFromMat(); UnitTestCuMatrixCopyFromTp(); @@ -2318,7 +2500,7 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixSetZeroAboveDiag(); UnitTestCuMatrixAddElements(); UnitTestCuMatrixLookup(); - UnitTestCuMatrixEqualElementMask(); + UnitTestCuMatrixEqualElementMask(); // test CuVector methods UnitTestCuVectorAddVec(); UnitTestCuVectorAddRowSumMat(); @@ -2331,17 +2513,17 @@ template void CudaMatrixUnitTest() { UnitTestCuMatrixIO(); UnitTestCuSigmoid(); UnitTestCuApproxEqual(); - UnitTestCuCopy(); -#if HAVE_CUDA == 1 + UnitTestCuCopy(); +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) #endif UnitTestCuCopy(); UnitTestCuMatrixAddToDiag(); UnitTestCuMatrixAdd2(); UnitTestCuDiffSigmoid(); - UnitTestCuMatrixGroupPnorm(); + UnitTestCuMatrixGroupPnorm(); UnitTestCuMatrixGroupPnormDeriv(); - UnitTestCuMatrixGroupMax(); + UnitTestCuMatrixGroupMax(); UnitTestCuMatrixGroupMaxDeriv(); UnitTestCuMatrixMulRowsVec(); UnitTestCuMatrixMulRowsGroupMat(); @@ -2377,7 +2559,7 @@ int main() { kaldi::CudaMatrixUnitTest(); - + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { kaldi::CudaMatrixUnitTest(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 03e6f8cfe2c..53f220e0c41 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -25,7 +25,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -46,7 +46,8 @@ namespace kaldi { template void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type) { + MatrixResizeType resize_type, + MatrixStrideType stride_type) { // This code does not currently support the other resize_type options. 
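Resize() above gains a stride_type argument; in the hunk that follows, the default case keeps the pitched GPU allocation while the packed case sets the stride equal to the number of columns. A minimal sketch of the layout difference, using plain row-major indexing rather than the Kaldi classes (the pitched stride value 8 below is purely illustrative):

#include <cassert>
#include <cstddef>

// Element (r, c) of a row-major matrix lives at data[r * stride + c].
// With a pitched allocation the stride can exceed cols; with a packed
// ("stride equals num-cols") layout the rows are contiguous and the whole
// matrix is one block of rows * cols elements.
inline std::size_t Offset(std::size_t r, std::size_t c, std::size_t stride) {
  return r * stride + c;
}

int main() {
  const std::size_t rows = 3, cols = 5;
  const std::size_t pitched_stride = 8;    // illustrative pitch / sizeof(Real)
  const std::size_t packed_stride = cols;  // packed layout
  assert(Offset(2, 4, pitched_stride) == 20);
  assert(Offset(2, 4, packed_stride) == 14);
  // Packed layout: total elements == rows * cols, no per-row padding.
  assert(rows * packed_stride == rows * cols);
  return 0;
}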
KALDI_ASSERT(resize_type == kSetZero || resize_type == kUndefined); if (rows * cols == 0) KALDI_ASSERT(rows == 0 && cols == 0); @@ -54,28 +55,35 @@ void CuMatrix::Resize(MatrixIndexT rows, MatrixIndexT cols, if (resize_type == kSetZero) this->SetZero(); return; } - if (this->num_rows_ != 0) this->Destroy(); - if (rows == 0) return; + if (rows == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixIndexT row_bytes = cols * sizeof(Real); size_t pitch; - this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( - row_bytes, rows, &pitch)); - this->num_rows_ = rows; - this->num_cols_ = cols; - this->stride_ = pitch / sizeof(Real); + if (stride_type == kDefaultStride) { + this->data_ = static_cast(CuDevice::Instantiate().MallocPitch( + row_bytes, rows, &pitch)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = pitch / sizeof(Real); + } else { // kStrideEqualNumCols + size_t bytes = rows * cols * sizeof(Real); + this->data_ = static_cast(CuDevice::Instantiate().Malloc(bytes)); + this->num_rows_ = rows; + this->num_cols_ = cols; + this->stride_ = cols; + } if (resize_type == kSetZero) this->SetZero(); - CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); + CuDevice::Instantiate().AccuProfile("CuMatrix::Resize", tim.Elapsed()); } else #endif { // Let the initializer of Matrix handle the allocation, // and then just do Swap which will switch the pointers. // This wastes a few instructions but is simple to code. - Matrix mat(rows, cols, resize_type); + Matrix mat(rows, cols, resize_type, stride_type); this->Swap(&mat); } } @@ -87,7 +95,7 @@ void CuMatrix::Destroy() { if (this->data_ != NULL) { Timer tim; CuDevice::Instantiate().Free(this->data_); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } } else #endif @@ -213,7 +221,7 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, KALDI_ASSERT(M.NumRows() == num_rows_ && M.NumCols() == num_cols_); } else { KALDI_ASSERT(M.NumCols() == num_rows_ && M.NumRows() == num_cols_); - } + } if (M.num_rows_ == 0) return; // Nothing to do. Timer tim; if (sizeof(Real) == sizeof(OtherReal) && trans == kNoTrans ) { @@ -223,15 +231,22 @@ void CuMatrixBase::CopyFromMat(const CuMatrixBase &M, CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, M.data_, src_pitch, width, M.num_rows_, cudaMemcpyDeviceToDevice)); } else { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // We are making this kernel "newer-style, with x corresponding to - // row dimension and y to column dimension. - dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); if (trans == kNoTrans) { + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_from_mat(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); } else { - cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), M.Dim()); + // 2D thread block with warps (blockDim.x) along the row-dim of input M. 
+ // Each (8x32) thread block will transpose (32x32) data + const int32 warpSize = 32; + dim3 dimBlock(warpSize, CU1DBLOCK / warpSize); + dim3 dimGrid(n_blocks(M.NumCols(), warpSize), + n_blocks(M.NumRows(), warpSize)); + cuda_copy_from_mat_trans(dimGrid, dimBlock, data_, M.data_, Dim(), + M.Dim()); } + CU_SAFE_CALL(cudaGetLastError()); } CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from other CuMatrixBase)", tim.Elapsed()); } else @@ -272,9 +287,9 @@ void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, if (trans == kNoTrans) { cuda_copy_from_tp(dimGrid, dimBlock, data_, M.Data(), Dim()); } else { - cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); + cuda_copy_from_tp_trans(dimGrid, dimBlock, data_, M.Data(), Dim()); } - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -294,10 +309,10 @@ template void CuMatrixBase::CopyFromTp(const CuTpMatrix &M, template void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixTransposeType trans) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (trans == kNoTrans) { - KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); + KALDI_ASSERT(src.NumRows() == num_rows_ && src.NumCols() == num_cols_); Timer tim; MatrixIndexT dst_pitch = stride_*sizeof(Real); @@ -305,7 +320,7 @@ void CuMatrixBase::CopyFromMat(const MatrixBase &src, MatrixIndexT width = src.NumCols()*sizeof(Real); CU_SAFE_CALL(cudaMemcpy2D(data_, dst_pitch, src.Data(), src_pitch, width, src.NumRows(), cudaMemcpyHostToDevice)); - + CuDevice::Instantiate().AccuProfile("CuMatrixBase::CopyFromMat(from CPU)",tim.Elapsed()); } else { CuMatrix trans_mat(src); // Do the transpose on the GPU board. 
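The transposed-copy launch above tiles the input M in 32x32 tiles: grid.x covers M.NumCols() and grid.y covers M.NumRows() in steps of warpSize, and each thread block has warpSize threads in x and CU1DBLOCK / warpSize threads in y. A quick check of that geometry (the value 256 for CU1DBLOCK is only an assumption here, and n_blocks is re-implemented locally as ceiling division):

#include <cassert>

// Ceiling division, mirroring the n_blocks() helper used in the launch code.
static int n_blocks(int size, int block) { return (size + block - 1) / block; }

int main() {
  const int warpSize = 32;
  const int kCu1dBlock = 256;                 // assumed value of CU1DBLOCK
  const int block_x = warpSize;               // 32 threads in x
  const int block_y = kCu1dBlock / warpSize;  // 8 threads in y
  assert(block_x * block_y == kCu1dBlock);
  // A 100x37 input M is covered by ceil(37/32) x ceil(100/32) = 2 x 4 blocks,
  // each block handling one 32x32 tile of M.
  assert(n_blocks(37, warpSize) == 2 && n_blocks(100, warpSize) == 4);
  // With 8 thread rows per block, each thread row covers 32/8 = 4 tile rows.
  assert(warpSize / block_y == 4);
  return 0;
}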
@@ -398,8 +413,8 @@ template template void CuMatrixBase::CopyToMat(MatrixBase *dst, MatrixTransposeType trans) const { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { if (trans == kTrans || sizeof(OtherReal) != sizeof(Real)) { CuMatrix this_trans(*this, trans); this_trans.CopyToMat(dst, kNoTrans); @@ -407,7 +422,7 @@ void CuMatrixBase::CopyToMat(MatrixBase *dst, KALDI_ASSERT(dst->NumRows() == NumRows() && dst->NumCols() == NumCols()); if (num_rows_ == 0) return; Timer tim; - + MatrixIndexT src_pitch = stride_*sizeof(Real); MatrixIndexT dst_pitch = dst->Stride()*sizeof(Real); MatrixIndexT width = NumCols()*sizeof(Real); @@ -458,10 +473,10 @@ void CuMatrixBase::Write(std::ostream &os, bool binary) const { template void CuMatrixBase::SetZero() { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, + CU_SAFE_CALL(cudaMemset2D(data_, stride_ * sizeof(Real), 0, num_cols_ * sizeof(Real), num_rows_ )); CuDevice::Instantiate().AccuProfile("CuMatrix::SetZero", tim.Elapsed()); } else @@ -477,15 +492,16 @@ void CuMatrixBase::SetZero() { /* * Methods wrapping the ANSI-C CUDA kernels */ -template +template void CuMatrixBase::Set(Real value) { - #if HAVE_CUDA == 1 + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_set_const(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -506,8 +522,9 @@ void CuMatrixBase::SetZeroAboveDiag() { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_set_zero_above_diag(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -526,15 +543,16 @@ void CuMatrixBase::SetZeroAboveDiag() { } } -template -void CuMatrixBase::Add(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::Add(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -547,9 +565,9 @@ void CuMatrixBase::Add(Real value) { } } -template -void CuMatrixBase::AddToDiag(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::AddToDiag(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; @@ -587,15 +605,16 @@ bool CuMatrixBase::IsUnit(Real tol) const { -template -void CuMatrixBase::Scale(Real value) { -#if HAVE_CUDA == 1 +template +void CuMatrixBase::Scale(Real value) { +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; 
+ GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_scale(dimGrid, dimBlock, data_, value, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -608,15 +627,16 @@ void CuMatrixBase::Scale(Real value) { } } -template -void CuMatrixBase::ApplyLog() { - #if HAVE_CUDA == 1 +template +void CuMatrixBase::ApplyLog() { + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (num_rows_ == 0) return; Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_log(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -637,13 +657,14 @@ void CuMatrixBase::MulElements(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -660,13 +681,14 @@ void CuMatrixBase::DivElements(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_div_elements(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -683,13 +705,14 @@ void CuMatrixBase::Max(const CuMatrixBase& A) { KALDI_ASSERT(num_cols_ == A.NumCols()); KALDI_ASSERT(num_rows_ == A.NumRows()); - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_max(dimGrid, dimBlock, data_, A.data_, Dim(), A.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -701,14 +724,16 @@ void CuMatrixBase::Max(const CuMatrixBase& A) { template void CuMatrixBase::MulColsVec(const CuVectorBase &scale) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; KALDI_ASSERT(scale.Dim() == NumCols()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_cols_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); @@ -732,31 +757,33 @@ void CuMatrixBase::MulRowsVec(const CuVectorBase &scale) { KALDI_ASSERT(scale.Dim() == NumRows()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_rows_vec(dimGrid, dimBlock, data_, scale.data_, Dim()); 
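Throughout these hunks the hand-rolled CU2DBLOCK launch shapes are replaced by a single call to GetBlockSizesForSimpleMatrixOperation(). For reference, the pattern being removed computed the shape as below; this is only a sketch of the old idiom, with CU2DBLOCK's value assumed to be 16 and n_blocks re-implemented as ceiling division:

#include <cassert>

// Ceiling division, as used by the old dimGrid computations.
static int n_blocks(int size, int block) { return (size + block - 1) / block; }

struct LaunchShape { int grid_x, grid_y, block_x, block_y; };

// The older pattern: a CU2DBLOCK x CU2DBLOCK thread block, and a grid with
// enough blocks to cover all columns in x and all rows in y.
static LaunchShape OldSimpleLaunchShape(int num_rows, int num_cols,
                                        int cu2dblock /* assumed 16 */) {
  LaunchShape s;
  s.block_x = cu2dblock;
  s.block_y = cu2dblock;
  s.grid_x = n_blocks(num_cols, cu2dblock);  // x covers columns
  s.grid_y = n_blocks(num_rows, cu2dblock);  // y covers rows
  return s;
}

int main() {
  LaunchShape s = OldSimpleLaunchShape(100, 99, 16);
  assert(s.grid_x == 7 && s.grid_y == 7);  // ceil(99/16) = ceil(100/16) = 7
  return 0;
}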
CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else + } else #endif { Mat().MulRowsVec(scale.Vec()); } } -template +template void CuMatrixBase::MulRowsGroupMat(const CuMatrixBase &src) { KALDI_ASSERT(src.NumCols() > 0); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; int group_size = this->NumCols() / src.NumCols(); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), - n_blocks(NumRows(), CU2DBLOCK)); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_mul_rows_group_mat(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size); @@ -776,14 +803,14 @@ void CuMatrixBase::GroupPnormDeriv(const CuMatrixBase &src1, KALDI_ASSERT(src2.NumCols() > 0); int group_size = this->NumCols() / src2.NumCols(); KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size, power); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_calc_pnorm_deriv(dimGrid, dimBlock, this->data_, src1.Data(), + src2.Data(), Dim(), src2.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -802,11 +829,10 @@ void CuMatrixBase::GroupMaxDeriv(const CuMatrixBase &src1, KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - + Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); cuda_calc_group_max_deriv(dimGrid, dimBlock, this->data_, src1.Data(), src2.Data(), Dim(), src2.Stride(), group_size); @@ -828,14 +854,15 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { KALDI_ASSERT(div.Dim() == NumRows()); - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_div_rows_vec(dimGrid, dimBlock, data_, div.data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else + } else #endif { Vector temp(div.Vec()); // will copy. 
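The Group* operations above (MulRowsGroupMat, GroupPnormDeriv, GroupMaxDeriv) all relate a wide matrix to a narrow one through group_size = wide cols / narrow cols, and assert that the wide width is an exact multiple. A tiny sketch of that layout; the mapping of wide column j to group j / group_size is how I read the kernels, not something stated explicitly in this patch:

#include <cassert>

int main() {
  const int wide_cols = 12, narrow_cols = 4;
  const int group_size = wide_cols / narrow_cols;  // same division as the code
  assert(wide_cols == narrow_cols * group_size);   // mirrors the KALDI_ASSERT
  assert(group_size == 3);
  assert(7 / group_size == 2);  // wide column 7 would fall in group 2
  return 0;
}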
@@ -843,17 +870,18 @@ void CuMatrixBase::DivRowsVec(const CuVectorBase &div) { Mat().MulRowsVec(temp); } } - + template void CuMatrixBase::InvertElements() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); - cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); + cuda_invert_elements(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -866,7 +894,7 @@ void CuMatrixBase::InvertElements() { template -void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, +void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, MatrixTransposeType transA) { #if HAVE_CUDA == 1 @@ -878,10 +906,14 @@ void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, } if (num_rows_ == 0) return; Timer tim; + // This block dimension seems to work better than the + // one from GetBlockSizesForSimpleMatrixOperation(). dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, data_, Dim(), A.Stride(), - (transA == kTrans ? 1 : 0)); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_add_mat(dimGrid, dimBlock, alpha, A.data_, + data_, Dim(), A.Stride(), + (transA == kTrans ? 1 : 0)); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -893,7 +925,53 @@ void CuMatrixBase::AddMat(Real alpha, const CuMatrixBase& A, } template -void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, +void CuMatrixBase::AddMatBlocks(Real alpha, const CuMatrixBase &A, + MatrixTransposeType transA) { + if (num_rows_ == 0 || num_cols_ == 0) return; + int32 num_row_blocks, num_col_blocks; + if (transA == kNoTrans) { + KALDI_ASSERT(A.NumRows() % num_rows_ == 0 && A.NumCols() % num_cols_ == 0); + num_row_blocks = A.Mat().NumRows() / num_rows_; + num_col_blocks = A.Mat().NumCols() / num_cols_; + } else { + KALDI_ASSERT(A.NumRows() % num_cols_ == 0 && A.NumCols() % num_rows_ == 0); + num_row_blocks = A.Mat().NumRows() / num_cols_; + num_col_blocks = A.Mat().NumCols() / num_rows_; + } +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_blocks(dimGrid, dimBlock, alpha, A.data_, num_row_blocks, + num_col_blocks, data_, Dim(), A.Stride(), + (transA == kTrans ? 
1 : 0)); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + int32 nr, nc; + if (transA == kNoTrans) { + nr = num_rows_; + nc = num_cols_; + } else { + nr = num_cols_; + nc = num_rows_; + } + for (int32 i = 0; i < num_row_blocks; i++) { + for (int32 j = 0; j < num_col_blocks; j++) { + Mat().AddMat(alpha, SubMatrix(A.Mat(), i * nr, nr, j * nc, nc), + transA); + } + } + } +} + +template +void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, const CuMatrixBase &B, const CuMatrixBase &C) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -903,11 +981,11 @@ void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, KALDI_ASSERT(num_rows_ == B.num_rows_ && num_cols_ == B.num_cols_); KALDI_ASSERT(num_rows_ == C.num_rows_ && num_cols_ == C.num_cols_); if (num_rows_ == 0) return; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, data_, Dim(), A.Stride(), B.Stride(), C.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_mat_div_mat(dimGrid, dimBlock, A.data_, B.data_, C.data_, + data_, Dim(), A.Stride(), B.Stride(), C.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -921,7 +999,7 @@ void CuMatrixBase::AddMatMatDivMat(const CuMatrixBase &A, template void CuMatrixBase::AddVecToCols(Real alpha, const CuVectorBase &col, - Real beta) { + Real beta) { if (col.Dim() != NumRows()) { KALDI_ERR << "Non matching dimensions: Rows:" << NumRows() << " VectorDim:" << col.Dim(); } @@ -929,13 +1007,13 @@ void CuMatrixBase::AddVecToCols(Real alpha, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, data_, Dim()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_vec_to_cols(dimGrid, dimBlock, alpha, col.data_, beta, + data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -950,20 +1028,19 @@ void CuMatrixBase::AddVecToCols(Real alpha, template void CuMatrixBase::AddVecToRows(Real alpha, const CuVectorBase &row, - Real beta) { + Real beta) { if (row.Dim() != NumCols()) { KALDI_ERR << "Non matching dimensions: Cols:" << NumCols() << " VectorDim:" << row.Dim(); } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_vec_to_rows(dimGrid, dimBlock, alpha, row.data_, beta, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -986,7 +1063,7 @@ void CuMatrixBase::AddMatMat( // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? // keep trans..., just swap A&B matrices: A->B B->A - MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); + MatrixIndexT m = ((transB==kTrans)? B.NumRows() : B.NumCols()); MatrixIndexT n = ((transA==kTrans)? 
A.NumCols() : A.NumRows()); MatrixIndexT k = ((transB==kTrans)? B.NumCols() : B.NumRows()); MatrixIndexT k1 = ((transA==kTrans)? A.NumRows() : A.NumCols()); @@ -996,17 +1073,16 @@ void CuMatrixBase::AddMatMat( KALDI_ASSERT(k == k1); if (m == 0) return; - - + + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - cublas_gemm((transB==kTrans?'T':'N'), (transA==kTrans?'T':'N'), m, n, k, - alpha, B.data_, B.Stride(), A.data_, A.Stride(), - beta, data_, Stride()); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_gemm(GetCublasHandle(), + (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + m, n, k, alpha, B.data_, B.Stride(), + A.data_, A.Stride(), beta, data_, Stride())); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1017,6 +1093,29 @@ void CuMatrixBase::AddMatMat( } +template +void CuMatrixBase::AddVecVec( + Real alpha, const CuVectorBase &x, const CuVectorBase &y) { + + MatrixIndexT m = y.Dim(); + MatrixIndexT n = x.Dim(); + KALDI_ASSERT(m == NumCols()); + KALDI_ASSERT(n == NumRows()); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CU_SAFE_CALL(cublas_ger(GetCublasHandle(), m, n, alpha, + y.Data(), 1, x.Data(), 1, data_, Stride())); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Mat().AddVecVec(alpha, x.Vec(), y.Vec()); + } +} + template void CuMatrixBase::SymAddMat2( @@ -1031,13 +1130,11 @@ void CuMatrixBase::SymAddMat2( #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - char trans = (transA == kTrans ? 'N' : 'T'); + cublasOperation_t trans = (transA == kTrans ? CUBLAS_OP_N : CUBLAS_OP_T); MatrixIndexT A_other_dim = (transA == kNoTrans ? A.num_cols_ : A.num_rows_); - - cublas_syrk('U', trans, num_rows_, A_other_dim, alpha, A.Data(), - A.Stride(), beta, this->data_, this->stride_); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, + num_rows_, A_other_dim, alpha, A.Data(), + A.Stride(), beta, this->data_, this->stride_)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1064,10 +1161,8 @@ void CuMatrixBase::AddDiagVecMat( Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; if (transM == kTrans) std::swap(M_row_stride, M_col_stride); @@ -1080,12 +1175,12 @@ void CuMatrixBase::AddDiagVecMat( { Mat().AddDiagVecMat(alpha, v.Vec(), M.Mat(), transM, beta); } -} +} template void CuMatrixBase::AddMatDiagVec( - const Real alpha, + const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, CuVectorBase &v, Real beta) { @@ -1099,14 +1194,11 @@ void CuMatrixBase::AddMatDiagVec( KALDI_ASSERT(v.Dim() == this->NumCols()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // Caution, this dimGrid is not the same way around as much of the other - // code: going forward, I want to use the (rows, cols) order. 
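The comment above ("keep trans..., just swap A&B matrices") relies on the identity (A*B)^T = B^T * A^T: a row-major buffer read as column-major data is the transpose of the matrix it stores, so a row-major C = A*B is obtained by asking a column-major GEMM for C^T = B^T * A^T, i.e. passing B first, A second, and swapping the m and n sizes. A small verification of that index identity in plain C++ (ColMajorGemm is a local reference routine, not the cuBLAS call):

#include <cassert>
#include <vector>

// Plain column-major GEMM: C(m x n) = A(m x k) * B(k x n), packed storage
// (leading dimension equals the number of rows).
static void ColMajorGemm(int m, int n, int k, const std::vector<double> &A,
                         const std::vector<double> &B, std::vector<double> *C) {
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {
      double sum = 0.0;
      for (int p = 0; p < k; p++) sum += A[p * m + i] * B[j * k + p];
      (*C)[j * m + i] = sum;
    }
}

int main() {
  // Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
  std::vector<double> a = {1, 2, 3, 4, 5, 6};     // A, row-major 2x3
  std::vector<double> b = {7, 8, 9, 10, 11, 12};  // B, row-major 3x2
  std::vector<double> c_rowmajor = {58, 64, 139, 154};  // C = A*B, row-major
  // The trick: feed the *same buffers* to the column-major GEMM with the
  // operands swapped (B first, A second) and m/n swapped; the output buffer
  // then already holds row-major C.
  std::vector<double> c(4, 0.0);
  ColMajorGemm(/*m=*/2, /*n=*/2, /*k=*/3, /*A=*/b, /*B=*/a, &c);
  for (int i = 0; i < 4; i++) assert(c[i] == c_rowmajor[i]);
  return 0;
}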
- dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; if (transM == kTrans) std::swap(M_row_stride, M_col_stride); - cuda_add_mat_diag_vec(dimGrid, dimBlock, alpha, data_, Dim(), M.Data(), M_row_stride, M_col_stride, v.Data(), beta); CU_SAFE_CALL(cudaGetLastError()); @@ -1119,14 +1211,16 @@ void CuMatrixBase::AddMatDiagVec( } template -void CuMatrixBase::AddMatMatElements(Real alpha, +void CuMatrixBase::AddMatMatElements(Real alpha, const CuMatrixBase &A, const CuMatrixBase &B, Real beta) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(), B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_mat_mat_elements(dimGrid, dimBlock, this->data_, A.Data(), + B.Data(), Dim(), A.Stride(), B.Stride(), alpha, beta); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1139,16 +1233,16 @@ void CuMatrixBase::AddMatMatElements(Real alpha, template void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_sigmoid(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1160,16 +1254,16 @@ void CuMatrixBase::Sigmoid(const CuMatrixBase &src) { template void CuMatrixBase::SoftHinge(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - - cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_soft_hinge(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1184,11 +1278,13 @@ void CuMatrixBase::GroupPnorm(const CuMatrixBase &src, Real power) { KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size && this->NumRows() == src.NumRows()); #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); - cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size, power); + dim3 dimGrid(n_blocks(NumCols(), 
CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); + cuda_group_pnorm(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride(), group_size, power); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1207,7 +1303,8 @@ void CuMatrixBase::GroupMax(const CuMatrixBase &src) { if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), + n_blocks(NumRows(), CU2DBLOCK)); cuda_group_max(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride(), group_size); CU_SAFE_CALL(cudaGetLastError()); @@ -1241,7 +1338,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s iter->column < num_cols && iter->column >= 0); } } - + #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (sv_labels.empty()) { @@ -1277,7 +1374,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s KALDI_ASSERT(this_prob >= 0.99e-20); // we floored to 1.0e-20 in SoftmaxLayer. *tot_objf += weight * Log(this_prob); *tot_weight += weight; - (*this)(m, label) += weight / this_prob; + (*this)(m, label) += weight / this_prob; } } } @@ -1285,7 +1382,7 @@ void CuMatrix::CompObjfAndDeriv(const std::vector >& s template // Y->this, X->src void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; size_t dimBlock = src.num_cols_ > CU1DBLOCK ? CU1DBLOCK : src.num_cols_; @@ -1308,7 +1405,7 @@ void CuMatrixBase::ApplySoftMaxPerRow(const CuMatrixBase &src) { template // Y->this, X->src void CuMatrixBase::ApplyLogSoftMaxPerRow(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; size_t dimBlock = src.num_cols_ > CU1DBLOCK ? 
CU1DBLOCK : src.num_cols_; @@ -1334,13 +1431,12 @@ template // Eout -> *this, Ein -> diff, Y -> value void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, const CuMatrixBase &diff) { KALDI_ASSERT(SameDim(*this, value) && SameDim(*this, diff)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_diff_sigmoid(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -1352,20 +1448,20 @@ void CuMatrixBase::DiffSigmoid(const CuMatrixBase &value, } } - + template void CuMatrixBase::Tanh(const CuMatrixBase &src) { KALDI_ASSERT(SameDim(*this, src)); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_tanh(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1379,13 +1475,12 @@ void CuMatrixBase::Tanh(const CuMatrixBase &src) { template // Ein -> diff, Y -> value void CuMatrixBase::DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff) { -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), n_blocks(num_rows_, CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_diff_tanh(dimGrid, dimBlock, data_, diff.data_, value.data_, Dim(), diff.Stride(), value.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -1399,39 +1494,18 @@ void CuMatrixBase::DiffTanh(const CuMatrixBase &value, template void CuMatrixBase::FindRowMaxId(CuArray *id) const { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - - // initialize the vectors - CuVector max(num_rows_); - max.Set(-1e21); id->Resize(num_rows_); - id->Set(-1); + MatrixDim d = Dim(); - MatrixDim d=Dim(); // only stride will be used! - - // process per 256 column blocks - for (int32 block = 0; (block+1)*256 <= num_cols_; block++) { - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(1, num_rows_); - int32 offset = block*CU1DBLOCK; + // CUDA thread layout: one thread block per matrix-row. + dim3 dimBlock(CU1DBLOCK); + dim3 dimGrid(num_rows_); + cuda_find_row_max_id(dimGrid, dimBlock, data_, NULL, id->Data(), d); + CU_SAFE_CALL(cudaGetLastError()); - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } - - // process the remainder - int32 div = num_cols_ / 256; - int32 mod = num_cols_ % 256; - if (mod != 0) { - dim3 dimBlock(mod, 1); - dim3 dimGrid(1, num_rows_); - int32 offset=div*256; - - cuda_find_row_max_id(dimGrid, dimBlock, data_ + offset, - max.data_, id->Data(), offset, d); - } // now we have the indices! 
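FindRowMaxId, rewritten above to use one thread block per matrix row, returns for each row the column index of its maximum element; the CPU fallback (below) keeps the first, i.e. lowest, index on ties because it compares with a strict '<'. A tiny C++ reference of that per-row argmax behaviour:

#include <cassert>
#include <vector>

// Per-row argmax with "first maximum wins" tie-breaking, mirroring the
// strict '<' comparison in the CPU fallback.
static int RowArgMax(const std::vector<double> &row) {
  double max = -1e21;
  int max_id = -1;
  for (int c = 0; c < static_cast<int>(row.size()); c++)
    if (max < row[c]) { max = row[c]; max_id = c; }
  return max_id;
}

int main() {
  assert(RowArgMax({0.5, 2.0, -1.0, 2.0}) == 1);  // ties keep the first index
  assert(RowArgMax({-3.0, -2.0, -7.0}) == 1);
  return 0;
}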
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1442,11 +1516,11 @@ void CuMatrixBase::FindRowMaxId(CuArray *id) const { id->Set(-1); // find maxima MatrixIndexT num_rows = num_rows_, num_cols = num_cols_; - for(MatrixIndexT r = 0; r < num_rows; r++) { + for (MatrixIndexT r = 0; r < num_rows; r++) { Real max = -1e21; int32 max_id = -1; const Real *row_data = Mat().RowData(r); - for(MatrixIndexT c = 0; c < num_cols; c++) { + for (MatrixIndexT c = 0; c < num_cols; c++) { if (max < row_data[c]) { max = row_data[c]; max_id = c; @@ -1460,14 +1534,13 @@ void CuMatrixBase::FindRowMaxId(CuArray *id) const { template void CuMatrixBase::DiffXent(const CuArray &tgt, CuVector *log_post_tgt) { - + KALDI_ASSERT(tgt.Dim() == num_rows_); log_post_tgt->Resize(tgt.Dim()); -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(1, CU2DBLOCK*8); dim3 dimGrid(1, n_blocks(tgt.Dim(), CU2DBLOCK*8)); cuda_diff_xent(dimGrid, dimBlock, tgt.Data(), data_, @@ -1527,16 +1600,16 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { return; } // At this point, if none of the other cases apply, we recurse. - + // The selection of dim1 is a heuristic. We could also just take half. int32 tot_dim = this->NumRows(); int32 dim1; // Break it up into a whole number of blocks, for better memory alignment. // The line below, setting dim1 can be decided on a heuristic basis: from - // the point of view of correctness, it can really be any value + // the point of view of correctness, it can really be any value // 0 < dim1 < tot_dim. dim1 = block_size * std::max(1, tot_dim / (2 * block_size)); - + int32 dim2 = tot_dim - dim1; CuSubMatrix this_11(*this, 0, dim1, 0, dim1), this_12(*this, 0, dim1, dim1, dim2), @@ -1567,7 +1640,7 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { L21 = A21 inv(L11') = A21 M11' We can compute L22 and M22 recursively by doing Cholesky (and computing the inverse Cholesky) on the quantity T = (A22 - L21 L21'). [we give it the name T just for easy reference.] - + Computationally, we do this as follows: (1) Recurse to get L11 and M11. (2) Compute L21 = A21 M11' @@ -1607,7 +1680,7 @@ void CuMatrixBase::Cholesky(CuMatrixBase *inv_cholesky) { // (5)(d) zero L12 and M12. this_12.SetZero(); inv_12.SetZero(); -} +} @@ -1617,13 +1690,13 @@ void CuMatrixBase::SymInvertPosDef() { if (num_rows_ == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; + Timer tim; CuMatrix inv_cholesky(num_rows_, num_rows_); this->Cholesky(&inv_cholesky); // note: SymAddMat2 only updates lower part of *this. this->SymAddMat2(1.0, inv_cholesky, kTrans, 0.0); this->CopyLowerToUpper(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -1661,31 +1734,35 @@ Real TraceMatMat(const CuMatrixBase &A, } else { KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols()); } - if (A.NumRows() * A.NumCols() > 16384) { - // This version in which we don't use a special-purpose kernel, but - // do AddDiagMat on a temporary vector and returns its sum, seems to be - // faster for larger matrices. The cutoff is approximate and - // we only looked at the time on square matrices, which - // is what we test in cu-matrix-speed-test.cc. 
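Both the removed AddDiagMatMat-based path and the new reduction kernel compute the same quantity: trace(A*B) needs only the diagonal of A*B, so trace(A*B) = sum_i sum_k A(i,k)*B(k,i), and in the transposed case trace(A*B^T) is just the sum of elementwise products of A and B. A short numeric check of that identity in plain C++:

#include <cassert>
#include <cmath>
#include <vector>

int main() {
  // A and B are 2x2, row-major.
  std::vector<double> A = {1, 2, 3, 4}, B = {5, 6, 7, 8};
  double trace_ab = 0.0, trace_abt = 0.0;
  for (int i = 0; i < 2; i++)
    for (int k = 0; k < 2; k++) {
      trace_ab  += A[i * 2 + k] * B[k * 2 + i];  // trace(A * B)
      trace_abt += A[i * 2 + k] * B[i * 2 + k];  // trace(A * B^T)
    }
  // Diagonal of A*B is {19, 50}, so trace(A*B) = 69.
  assert(std::fabs(trace_ab - 69.0) < 1e-12);
  // Elementwise: 1*5 + 2*6 + 3*7 + 4*8 = 70.
  assert(std::fabs(trace_abt - 70.0) < 1e-12);
  return 0;
}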
- CuVector sum_vec(A.NumRows()); - sum_vec.AddDiagMatMat(1.0, A, kNoTrans, - B, trans, 0.0); - return sum_vec.Sum(); - } else { - Timer tim; - // the sizes of result_vec must match what we - // call the kernels with, in cu-kernels.cu - CuVector result_vec(trans == kTrans ? 4 : 2, kUndefined); - if (trans == kNoTrans) { - cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); - } else { - cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data()); + Timer tim; + // 2D blocks: each (8x32) block sums up (32x32) elements. + // 2D grid: try to cover all the matrix A unless it is too big. + // Kernel will reduce to ~256 elements with good performance, + // if the matrix is not in a very bad shape. + // (wider or taller than 32x8192) + // CPU will then reduce to 1 element. + const int kWarpSize = 32; + dim3 dimBlock(kWarpSize, CU1DBLOCK / kWarpSize); + dim3 dimGrid(n_blocks(A.NumCols(), kWarpSize), + n_blocks(A.NumRows(), kWarpSize)); + if (dimGrid.x * dimGrid.y > 256) { + dimGrid.y = 256 / dimGrid.x; + if (dimGrid.y == 0) { + dimGrid.y = 1; } - CU_SAFE_CALL(cudaGetLastError()); - Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. - result = result_cpu.Sum(); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } + CuVector result_vec(dimGrid.x * dimGrid.y, kUndefined); + if (trans == kNoTrans) { + cuda_trace_mat_mat(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(), + B.Stride(), result_vec.Data()); + } else { + cuda_trace_mat_mat_trans(dimGrid, dimBlock, A.Data(), B.Data(), A.Dim(), + B.Stride(), result_vec.Data()); + } + CU_SAFE_CALL(cudaGetLastError()); + Vector result_cpu(result_vec); // copying from CUDA faster than summing in CUDA. + result = result_cpu.Sum(); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -1703,6 +1780,93 @@ double TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans); +template +void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta) { + KALDI_ASSERT(A.size() == B.size() && B.size() == C.size()); + int32 size = A.size(); + + if (size == 0) return; + + // all elements must have the same num-rows, num-cols and stride + for (int32 i = 0; i + 1 < size; i++) { + KALDI_ASSERT(A[i]->NumRows() == A[i+1]->NumRows()); + KALDI_ASSERT(A[i]->NumCols() == A[i+1]->NumCols()); + KALDI_ASSERT(A[i]->Stride() == A[i+1]->Stride()); + KALDI_ASSERT(B[i]->NumRows() == B[i+1]->NumRows()); + KALDI_ASSERT(B[i]->NumCols() == B[i+1]->NumCols()); + KALDI_ASSERT(B[i]->Stride() == B[i+1]->Stride()); + KALDI_ASSERT(C[i]->NumRows() == C[i+1]->NumRows()); + KALDI_ASSERT(C[i]->NumCols() == C[i+1]->NumCols()); + KALDI_ASSERT(C[i]->Stride() == C[i+1]->Stride()); + } + // CUBLAS is col-major, cudamatrix is row-major, how to do the mapping? + // keep trans..., just swap A&B matrices: A->B B->A + MatrixIndexT m = ((transB==kTrans)? B[0]->NumRows() : B[0]->NumCols()); + MatrixIndexT n = ((transA==kTrans)? A[0]->NumCols() : A[0]->NumRows()); + MatrixIndexT k = ((transB==kTrans)? B[0]->NumCols() : B[0]->NumRows()); + MatrixIndexT k1 = ((transA==kTrans)? 
A[0]->NumRows() : A[0]->NumCols()); + + KALDI_ASSERT(m == C[0]->NumCols()); + KALDI_ASSERT(n == C[0]->NumRows()); + KALDI_ASSERT(k == k1); + + if (m == 0) return; + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + Real **device_abc_array = + static_cast(CuDevice::Instantiate().Malloc(3 * size * sizeof(Real*))); + const Real **device_a_array = const_cast(device_abc_array); + const Real **device_b_array = const_cast(device_abc_array) + size; + Real **device_c_array = device_abc_array + 2 * size; + const Real **host_abc_array = new const Real*[3*size]; + const Real **host_a_array = host_abc_array; + const Real **host_b_array = host_abc_array + size; + const Real **host_c_array = host_abc_array + 2 * size; + + for (int32 i = 0; i < size; i++) { + host_a_array[i] = A[i]->data_; + host_b_array[i] = B[i]->data_; + host_c_array[i] = C[i]->data_; + } + + CU_SAFE_CALL(cudaMemcpy(device_abc_array, host_abc_array, 3*size*sizeof(Real*), cudaMemcpyHostToDevice)); + + CU_SAFE_CALL(cublas_gemmBatched(GetCublasHandle(), + (transB==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + (transA==kTrans? CUBLAS_OP_T:CUBLAS_OP_N), + m, n, k, alpha, device_b_array, B[0]->Stride(), + device_a_array, A[0]->Stride(), beta, + device_c_array, C[0]->Stride(), size)); + + CuDevice::Instantiate().Free(device_abc_array); + delete[] host_abc_array; + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + for (int32 i = 0; i < size; i++) { + C[i]->Mat().AddMatMat(alpha, A[i]->Mat(), transA, B[i]->Mat(), transB, beta); + } + } +} + +template +void AddMatMatBatched(const float alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const float beta); + +template +void AddMatMatBatched(const double alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const double beta); template void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { @@ -1722,10 +1886,11 @@ void CuMatrixBase::CopyRowsFromVec(const CuVectorBase &v) { cudaMemcpyDeviceToDevice)); } } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // this is a newer kernel where (x,y) dims represent (rows,cols). - dim3 dimGrid(n_blocks(NumRows(),CU2DBLOCK), n_blocks(NumCols(),CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, data_, this->Dim(), v.Data()); + CU_SAFE_CALL(cudaGetLastError()); } else { KALDI_ERR << "Wrong sized arguments"; } @@ -1755,17 +1920,11 @@ void CuMatrixBase::CopyRowsFromVec(const VectorBase &v) { } } } else if (v.Dim() == num_cols_) { - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This is a newer kernel where x corresponds to NumRows() and y to NumCols(). 
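The CopyRowsFromVec overloads above accept two vector shapes: a vector of size rows*cols, whose consecutive row-sized chunks fill the rows, and a vector of size cols, which is written into every row. That is my reading of the two branches (the rows*cols case is suggested by the contiguous memcpy when stride equals num-cols), not a statement of the official API contract. A plain C++ sketch of those two fill patterns:

#include <cassert>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  std::vector<double> m(rows * cols, 0.0);

  // Case 1: v.size() == rows * cols -> row r gets v[r*cols .. r*cols + cols-1].
  std::vector<double> v1 = {1, 2, 3, 4, 5, 6};
  for (int r = 0; r < rows; r++)
    for (int c = 0; c < cols; c++) m[r * cols + c] = v1[r * cols + c];
  assert(m[1 * cols + 0] == 4.0);

  // Case 2: v.size() == cols -> every row becomes a copy of v.
  std::vector<double> v2 = {7, 8, 9};
  for (int r = 0; r < rows; r++)
    for (int c = 0; c < cols; c++) m[r * cols + c] = v2[c];
  assert(m[0] == 7.0 && m[1 * cols + 2] == 9.0);
  return 0;
}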
- dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_rows_from_vec(dimGrid, dimBlock, this->data_, this->Dim(), v.Data()); CU_SAFE_CALL(cudaGetLastError()); - - /* const Real *v_data = v.Data(); - for (MatrixIndexT r = 0; r < num_rows_; r++) - cudaMemcpy(RowData(r), v_data, sizeof(Real)*num_cols_, cudaMemcpyHostToDevice); */ } else { KALDI_ERR << "Wrong sized arguments"; } @@ -1787,9 +1946,9 @@ void CuMatrixBase::CopyColFromVec(const CuVectorBase &v, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid(n_blocks(NumRows(), CU1DBLOCK)); - cuda_copy_col_from_vec(dimGrid, dimBlock, data_, v.Data(), col, Dim()); + cublas_copy(GetCublasHandle(), + v.Dim(), v.Data(), 1, + this->data_ + col, this->stride_); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1804,10 +1963,9 @@ void CuMatrixBase::ApplyPow(Real power) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_pow(dimGrid, dimBlock, data_, power, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1823,10 +1981,9 @@ void CuMatrixBase::ApplyPowAbs(Real power, bool include_sign) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_pow_abs(dimGrid, dimBlock, data_, power, include_sign, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1842,10 +1999,9 @@ void CuMatrixBase::ApplyHeaviside() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_heaviside(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1856,15 +2012,35 @@ void CuMatrixBase::ApplyHeaviside() { } } +template +void CuMatrixBase::Heaviside(const CuMatrixBase &src) { + KALDI_ASSERT(SameDim(*this, src)); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_heaviside(dimGrid, dimBlock, this->data_, src.data_, this->Dim(), + src.Stride()); + CU_SAFE_CALL(cudaGetLastError()); + + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else + #endif + { + Mat().Heaviside(src.Mat()); + } +} template void CuMatrixBase::ApplyExp() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), 
NumCols(), + &dimGrid, &dimBlock); cuda_apply_exp(dimGrid, dimBlock, data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1881,9 +2057,9 @@ void CuMatrixBase::ApplyFloor(Real floor_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1899,9 +2075,9 @@ void CuMatrixBase::ApplyCeiling(Real ceiling_val) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1922,6 +2098,7 @@ void VectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { if (mat.Stride() == mat.NumCols()) { cudaMemcpy(data_, mat.Data(), sizeof(Real)*dim_, cudaMemcpyDeviceToHost); } else { + // we could definitely do better than the following. Real* vec_data = data_; for (MatrixIndexT r = 0; r < mat.NumRows(); r++) { cudaMemcpy(vec_data, mat.RowData(r), sizeof(Real) * mat.NumCols(), @@ -1952,9 +2129,9 @@ void CuMatrixBase::CopyCols(const CuMatrixBase &src, KALDI_ASSERT(indices.Dim() == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_copy_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1965,7 +2142,7 @@ void CuMatrixBase::CopyCols(const CuMatrixBase &src, } } - + template void CuMatrixBase::CopyRows(const CuMatrixBase &src, const CuArray &indices) { @@ -1973,12 +2150,13 @@ void CuMatrixBase::CopyRows(const CuMatrixBase &src, if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(indices.Dim()) == NumRows()); KALDI_ASSERT(NumCols() == src.NumCols()); - + Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). 
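The CopyColFromVec hunk a little above replaces a dedicated kernel with a BLAS-style strided copy: because element (r, col) of a row-major matrix lives at data[r*stride + col], copying a length-NumRows vector with output increment equal to the row stride writes exactly one column. A small sketch of that indexing (StridedCopy is a local stand-in for the cuBLAS copy, not its real signature):

#include <cassert>
#include <vector>

// BLAS-style copy: y[i * incy] = x[i * incx] for i in [0, n).
static void StridedCopy(int n, const double *x, int incx, double *y, int incy) {
  for (int i = 0; i < n; i++) y[i * incy] = x[i * incx];
}

int main() {
  const int rows = 3, cols = 4, stride = cols, col = 2;
  std::vector<double> m(rows * stride, 0.0);
  std::vector<double> v = {1.0, 2.0, 3.0};
  // Writing with increment == stride fills column `col` of the matrix.
  StridedCopy(rows, v.data(), 1, m.data() + col, stride);
  assert(m[0 * stride + col] == 1.0 &&
         m[1 * stride + col] == 2.0 &&
         m[2 * stride + col] == 3.0);
  return 0;
}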
- dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), indices.Data(), + Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -1996,10 +2174,11 @@ void CuMatrixBase::AddCols(const CuMatrixBase &src, KALDI_ASSERT(indices.Dim() == NumCols()); KALDI_ASSERT(NumRows() == src.NumRows()); Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), Dim(), src.Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), indices.Data(), + Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -2008,18 +2187,17 @@ void CuMatrixBase::AddCols(const CuMatrixBase &src, Mat().AddCols(src.Mat(), indices.Data()); } } - + template void CuMatrixBase::CopyRows(const CuArray &src) { if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), + n_blocks(num_rows_, CU2DBLOCK)); cuda_copy_rows(dimGrid, dimBlock, data_, src.Data(), Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2037,11 +2215,11 @@ void CuMatrixBase::CopyToRows(const CuArray &dst) const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - + Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK), + n_blocks(num_rows_, CU2DBLOCK)); cuda_copy_to_rows(dimGrid, dimBlock, dst.Data(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2062,11 +2240,10 @@ void CuMatrixBase::AddRows(Real alpha, if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); KALDI_ASSERT(src.NumCols() == NumCols()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_rows(dimGrid, dimBlock, alpha, data_, src.Data(), indexes.Data(), Dim(), src.Stride()); CU_SAFE_CALL(cudaGetLastError()); @@ -2085,11 +2262,10 @@ void CuMatrixBase::AddRows(Real alpha, const CuArray &src) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(src.Dim()) == NumRows()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_rows(dimGrid, 
dimBlock, alpha, data_, src.Data(), Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2108,11 +2284,10 @@ void CuMatrixBase::AddToRows(Real alpha, #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(static_cast(dst.Dim()) == NumRows()); - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_to_rows(dimGrid, dimBlock, alpha, dst.Data(), data_, Dim()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2132,12 +2307,12 @@ void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // This kernel, as it is newer has the (x,y) dims as (rows,cols). - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK)); - cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indices.Data()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_sum_column_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), + src.Dim(), indices.Data()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -2165,16 +2340,15 @@ void CuMatrixBase::SumColumnRanges(const CuMatrixBase &src, template void CuMatrixBase::AddRowRanges(const CuMatrixBase &src, const CuArray &indexes) { - KALDI_ASSERT(static_cast(indexes.Dim()) == NumCols()); - KALDI_ASSERT(src.NumCols() >= NumCols()); + KALDI_ASSERT(static_cast(indexes.Dim()) == NumRows()); + KALDI_ASSERT(src.NumCols() == NumCols()); if (NumRows() == 0) return; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), - n_blocks(NumCols(), CU2DBLOCK)); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); cuda_add_row_ranges(dimGrid, dimBlock, data_, Dim(), src.Data(), src.Dim(), indexes.Data()); CU_SAFE_CALL(cudaGetLastError()); @@ -2188,9 +2362,9 @@ void CuMatrixBase::AddRowRanges(const CuMatrixBase &src, const Real *src_data = src.data_; const Int32Pair *indexes_data = indexes.Data(); for (int32 row = 0; row < num_rows; row++) { + int32 start_row = indexes_data[row].first, + end_row = indexes_data[row].second; for (int32 col = 0; col < num_cols; col++) { - int32 start_row = indexes_data[col].first, - end_row = indexes_data[col].second; Real sum = 0.0; for (int32 src_row = start_row; src_row < end_row; src_row++) sum += src_data[src_row * src_stride + col]; @@ -2209,7 +2383,7 @@ void CuMatrixBase::CopyLowerToUpper() { if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - int32 dim = this->num_rows_; + int32 dim = num_rows_; dim3 dimGrid(n_blocks(dim, CU2DBLOCK), n_blocks(dim, CU2DBLOCK)); cuda_copy_low_upp(dimGrid, dimBlock, data_, Dim()); @@ -2229,8 +2403,8 @@ void CuMatrixBase::CopyUpperToLower() { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); int32 dim = this->num_rows_; + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(dim, CU2DBLOCK), n_blocks(dim, CU2DBLOCK)); cuda_copy_upp_low(dimGrid, 
dimBlock, data_, Dim()); @@ -2295,10 +2469,10 @@ Real CuMatrixBase::Trace(bool check_square) const { int dimBlock(CU1DBLOCK); int dimGrid = 1;// only 1 block here. we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); cuda_vec_sum(dimGrid, dimBlock, data_, tmp.Data(), dim, Stride() + 1); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::Sum", tim.Elapsed()); return tmp(0); - } else + } else #endif { return Mat().Trace(check_square); @@ -2348,7 +2522,7 @@ void CuMatrixBase::SetRandn() { if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandGaussian(this); - } else + } else #endif { Mat().SetRandn(); @@ -2362,7 +2536,7 @@ void CuMatrixBase::SetRandUniform() { if (CuDevice::Instantiate().Enabled()) { CuRand tmp; tmp.RandUniform(this); - } else + } else #endif { Mat().SetRandUniform(); @@ -2403,26 +2577,17 @@ template void CuMatrix::Transpose() { if (this->num_rows_ == 0) return; -#if HAVE_CUDA == 1 - if (this->num_rows_ == this->num_cols_ && CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - // (x,y) indices will be (row of *this, col of *this) - dim3 dimGrid(n_blocks(this->num_rows_, CU2DBLOCK), - n_blocks(this->num_cols_, CU2DBLOCK)); - cuda_transpose_matrix(dimGrid, dimBlock, this->data_, this->Dim()); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - CuMatrix tmp(*this, kTrans); - *this = tmp; - } + // Copy and swap for all cases. + // No need for a separate kernel for square-matrix in-place transpose. + // It has the same possible peak performance as copy transpose, + // if allocate/deallocate overhead can be ignored. + CuMatrix tmp(*this, kTrans); + this->Swap(&tmp); } // Version of AddMatMat where 2nd argument is of type CuBlockMatrix. +// Caution: template void CuMatrixBase::AddMatBlock( Real alpha, @@ -2451,19 +2616,21 @@ void CuMatrixBase::AddMatBlock( if (CuDevice::Instantiate().Enabled()) { Timer tim; MatrixDim this_dim = Dim(); - + dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); // (x,y) indices will be (row of *this, block of B) dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(B_num_blocks, CU2DBLOCK)); + // caution: the use of x as the row-index is not good, but + // this code is not much used, so I'm not updating it. cuda_add_mat_blockmat(dimGrid, dimBlock, data_, this_dim, A.Data(), A_num_rows, A_num_cols, A_row_stride, A_col_stride, B.CuData(), B_num_blocks, alpha, beta, (transB == kTrans ? 1 : 0)); - - CU_SAFE_CALL(cudaGetLastError()); - + + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -2494,7 +2661,7 @@ void CuMatrixBase::AddMatBlock( } template -void CuMatrixBase::AddElements(Real alpha, +void CuMatrixBase::AddElements(Real alpha, const std::vector >& input) { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; @@ -2511,7 +2678,7 @@ void CuMatrixBase::AddElements(Real alpha, Timer tim; int dimBlock(CU1DBLOCK); - int dimGrid = 1;// only 1 block here.
we have loops in each thread //(n_blocks(dim_, CU1DBLOCK)); + int dimGrid(n_blocks(input.size(), CU1DBLOCK)); cuda_matrix_add_elements(dimGrid, dimBlock, this->data_, this->Dim(), alpha, (MatrixElement*)addr, input.size()); @@ -2527,47 +2694,97 @@ void CuMatrixBase::AddElements(Real alpha, } } +template +void CuMatrixBase::AddElements(Real alpha, const CuArray &indexes, + const Real *input) { + if (indexes.Dim() == 0) return; + KALDI_ASSERT(input != NULL); + +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + CuVector tmp_vec(indexes.Dim(), kUndefined); + CU_SAFE_CALL(cudaMemcpy(tmp_vec.Data(), input, indexes.Dim() * sizeof(Real), + cudaMemcpyHostToDevice)); + + int dimBlock(CU1DBLOCK); + int dimGrid = n_blocks(indexes.Dim(), CU1DBLOCK); + cuda_matrix_add_indexed_values(dimGrid, dimBlock, this->Dim(), alpha, + indexes.Data(), tmp_vec.Data(), indexes.Dim(), this->data_); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; + const Int32Pair *index = indexes.Data(); + for (int32 i = 0; i < indexes.Dim(); i++) { + KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 && + index[i].second < num_cols && index[i].second >= 0); + (*this)(index[i].first, index[i].second) += alpha * input[i]; + } + } +} + template void CuMatrixBase::Lookup(const std::vector &indices, - std::vector *output) const { + Real *output) const { // Checks the dimension. MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; for (int32 i = 0; i < indices.size(); ++i) { KALDI_ASSERT(indices[i].first < num_rows && indices[i].first >= 0 && indices[i].second < num_cols && indices[i].second >= 0); } - - // Checks the pointer. + if (indices.size() == 0) return; KALDI_ASSERT(output != NULL); - // Resizes the output vector. 
- output->resize(indices.size()); - #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { CuArray cuda_indices(indices); - CuArray cuda_output(output->size()); + Lookup(cuda_indices, output); + } else +#endif + { + for (int32 i = 0; i < indices.size(); i++) { + output[i] = (*this)(indices[i].first, indices[i].second); + } + } +} + +template +void CuMatrixBase::Lookup(const CuArray &indices, + Real *output) const { + int32 num_elements = indices.Dim(); + if (num_elements == 0) return; + KALDI_ASSERT(output != NULL); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuArray cuda_output(num_elements); Timer tim; dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(indices.size(), CU1DBLOCK), 1); - + dim3 dimGrid(n_blocks(num_elements, CU1DBLOCK), 1); + cuda_matrix_lookup(dimGrid, dimBlock, this->data_, this->Dim(), - cuda_indices.Data(), indices.size(), cuda_output.Data()); + indices.Data(), num_elements, cuda_output.Data()); CU_SAFE_CALL(cudaGetLastError()); - cuda_output.CopyToVec(output); - + cuda_output.CopyToHost(output); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { - for (int32 i = 0; i < indices.size(); i++) { - (*output)[i] = (*this)(indices[i].first, indices[i].second); + MatrixIndexT num_rows = this->num_rows_, num_cols = this->num_cols_; + const Int32Pair *index = indices.Data(); + for (int32 i = 0; i < num_elements; i++) { + KALDI_ASSERT(index[i].first < num_rows && index[i].first >= 0 && + index[i].second < num_cols && index[i].second >= 0); + output[i] = (*this)(index[i].first, index[i].second); } } } + template void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatrix *mask) const { // Check the inputs: @@ -2579,10 +2796,12 @@ void CuMatrixBase::EqualElementMask(const CuMatrixBase &mat, CuMatri #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); - dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK)); - - cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), mask->Data(), this->Dim(), mat.Stride(), mask->Stride()); + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_equal_element_mask(dimGrid, dimBlock, this->data_, mat.Data(), + mask->Data(), this->Dim(), mat.Stride(), + mask->Stride()); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -2611,7 +2830,7 @@ std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat) { // instantiate the template template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); -template +template std::ostream &operator << (std::ostream &out, const CuMatrixBase &mat); @@ -2621,7 +2840,7 @@ template class CuMatrix; template class CuMatrixBase; template class CuMatrixBase; - + diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 0fcb517994c..fec26424ef8 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -27,6 +27,7 @@ #define KALDI_CUDAMATRIX_CU_MATRIX_H_ #include +#include #include "cudamatrix/cu-matrixdim.h" #include "cudamatrix/cu-common.h" @@ -43,6 +44,17 @@ namespace kaldi { template Real TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, MatrixTransposeType trans = kNoTrans); + +/// Does multiple matrix multiplications, executing them in parallel using +/// cuBLAS's gemmBatched if we are using a GPU. 
Vectors A, B and C must have +/// the same length; for each i, this function executes the matrix operation +/// C[i] = alpha * A[i](^T)*B[i](^T) + beta * C[i]. +template +void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta); + /** * Matrix for CUDA computing. * Does the computation on the CUDA card when CUDA is compiled in and @@ -95,9 +107,9 @@ class CuMatrixBase { /// and src.NumRows() must equal this.NumRows() void AddCols(const CuMatrixBase &src, const CuArray &indices); - + /// Copies row r from row indexes[r] of src. - /// As a special case, if indexes[i] < 0, sets row i to zero + /// As a special case, if indexes[i] < 0, sets row i to zero /// "reorder".size() must equal this->NumRows(), and /// src.NumCols() must equal this.NumCols() void CopyRows(const CuMatrixBase &src, @@ -111,19 +123,18 @@ class CuMatrixBase { /// NULL then this.Row(r) will be set to zero. void CopyRows(const CuArray &src); - /// For each row r of this matrix, copies it to the array of floats at - /// the location given by dst[r], where dst[r] is assumed to be obtained from the RowData() - /// function of another CuMatrix, or from CuVector::Data() (i.e. it should point - /// to memory on the GPU if we're using a GPU, or on the CPU otherwise). - /// If dst[r] is NULL, does not copy anywhere. Requires that none of the - /// memory regions pointed to by the pointers in "dst" overlap (e.g. none of - /// the pointers should be the same). + /// For each row r of this matrix, copies it to the array of floats at the + /// location given by dst[r], where dst[r] is assumed to be obtained from the + /// RowData() function of another CuMatrix, or from CuVector::Data() (i.e. it + /// should point to memory on the GPU if we're using a GPU, or on the CPU + /// otherwise). If dst[r] is NULL, does not copy anywhere. Requires that + /// none of the memory regions pointed to by the pointers in "dst" overlap + /// (e.g. none of the pointers should be the same). void CopyToRows(const CuArray &dst) const; - /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). /// If indexes[r] < 0, does not add anything. - /// "reorder".size() must equal this->NumRows(), + /// "reorder".size() must equal this->NumRows(), /// all elements of "reorder" must be in [0, src.NumRows()-1], /// and src.NumCols() must equal this.NumCols() void AddRows(Real alpha, @@ -135,7 +146,7 @@ class CuMatrixBase { /// a vector of floats, of the same length as this.NumCols(). void AddRows(Real alpha, const CuArray &src); - + /// For each row r of this matrix, adds it (times alpha) to the array of /// floats at the location given by dst[r], where dst[r] is assumed to be @@ -145,7 +156,7 @@ class CuMatrixBase { /// for that row. Requires that none of the memory regions pointed to by the /// pointers in "dst" overlap (e.g. none of the pointers should be the same). void AddToRows(Real alpha, const CuArray &dst) const; - + /// For each row r of this and for each column c, sets (*this)(r, c) to the /// sum \sum_j src(r, j), where j ranges from indexes[c].first through @@ -156,12 +167,12 @@ class CuMatrixBase { /// For each row r of this and for each column c, do /// (*this)(r, c) += \sum_j src(j, c), - /// where j ranges from indexes[c].first through indexes[c].second - 1. - /// All indexes must be >= 0 and < src.NumRows(); to represent an empty range - /// just use the same index twice. 
+ /// where j ranges from indexes[r].first through indexes[r].second - 1. + /// In general indexes must be >= 0 and < src.NumRows(); but to represent an empty range + /// you may use the pair (-1, -1) or any pair of numbers (i, j) such that i >= j. void AddRowRanges(const CuMatrixBase &src, const CuArray &indexes); - + friend Real TraceMatMat(const CuMatrixBase &A, const CuMatrixBase &B, @@ -171,10 +182,15 @@ class CuMatrixBase { const CuSparseMatrix &B, MatrixTransposeType trans); + friend void AddMatMatBatched(const Real alpha, std::vector* > &C, + const std::vector* > &A, MatrixTransposeType transA, + const std::vector* > &B, MatrixTransposeType transB, + const Real beta); + /// Adds "value" to the diagonal elements of the matrix. The matrix /// *this does not have to be square. void AddToDiag(Real value); - + /// Dimensions MatrixIndexT NumRows() const { return num_rows_; } MatrixIndexT NumCols() const { return num_cols_; } @@ -182,21 +198,21 @@ class CuMatrixBase { // MatrixDim is a struct containing "rows", "cols" and "stride", // that is an argument of most CUDA kernels. - ::MatrixDim Dim() const { - ::MatrixDim d = { num_rows_, num_cols_, stride_ }; - return d; + ::MatrixDim Dim() const { + ::MatrixDim d = { num_rows_, num_cols_, stride_ }; + return d; } Real FrobeniusNorm() const { return sqrt(TraceMatMat(*this, *this, kTrans)); } - bool IsUnit(Real tol = 0.001) const; + bool IsUnit(Real tol = 0.001) const; /// True if ((*this)-other).FrobeniusNorm() <= tol * this->FrobeniusNorm() bool ApproxEqual(const CuMatrixBase &other, float tol = 0.01) const; - + /// Get size of matrix in bytes MatrixIndexT SizeInBytes() const { return num_rows_*stride_*sizeof(Real); } - + // Copy functions. These do not resize. template void CopyFromMat(const MatrixBase &src, @@ -209,23 +225,28 @@ class CuMatrixBase { MatrixTransposeType trans = kNoTrans); void CopyFromSp(const CuSpMatrix &M); - + template void CopyFromTp(const CuTpMatrix &M, MatrixTransposeType trans = kNoTrans); - + template void CopyFromMat(const CuMatrixBase &M, - MatrixTransposeType trans = kNoTrans); + MatrixTransposeType trans = kNoTrans); template void CopyToMat(MatrixBase *dst, MatrixTransposeType trans = kNoTrans) const; - + + /// This function has two modes of operation. If v.Dim() == NumRows() * + /// NumCols(), then treats the vector as a row-by-row concatenation of a + /// matrix and copies to *this. + /// if v.Dim() == NumCols(), it sets each row of *this to a copy of v. void CopyRowsFromVec(const CuVectorBase &v); + /// Version of CopyRowsFromVec() that takes a CPU-based vector. void CopyRowsFromVec(const VectorBase &v); - + /// Copy vector into specific column of matrix. void CopyColFromVec(const CuVectorBase &v, const MatrixIndexT col); @@ -233,6 +254,11 @@ class CuMatrixBase { /// element by element, x = 1 / (1 + exp(-x)) void Sigmoid(const CuMatrixBase &src); + /// Set each element to the Heaviside function of the corresponding element + /// of "src", which we define as the function (x > 0 ? 1.0 : 0.0) [note: + /// in general, there are different ways to deal with the situation when x==0.] + void Heaviside(const CuMatrixBase &src); + /// Apply the function y = log(1 + exp(x)), to each element. /// Note: the derivative of this function is the sigmoid function. /// This is like a soft ReLU. @@ -268,7 +294,7 @@ class CuMatrixBase { /// defined (it's not defined where multiple inputs in the group are equal to the output). 
void GroupMaxDeriv(const CuMatrixBase &input, const CuMatrixBase &output); - + /// Compute the hyperbolic tangent (tanh) function; element by element, /// *this = tanh(src). void Tanh(const CuMatrixBase &src); @@ -282,9 +308,9 @@ class CuMatrixBase { /// tanh output. Does, element-by-element, *this = diff * (1 - value^2). void DiffTanh(const CuMatrixBase &value, const CuMatrixBase &diff); - + /// Differentiate the block [softmax+cross-entropy] : - /// dE/da = posterior_mat - target_mat, + /// dE/da = posterior_mat - target_mat, /// 'E' is error function, 'a' is activation on softmax input /// /// Interface: @@ -293,7 +319,7 @@ class CuMatrixBase { /// log_post_tgt ... per-frame statistics for cross-entropy computations : /// log(sum_row(posterior_mat .* target_mat)) void DiffXent(const CuArray &tgt, - CuVector *log_post_tgt); + CuVector *log_post_tgt); /// This function sets *this to the Cholesky factor of *this (i.e. the C /// satisfying *this = C C^T), and sets "inv_cholesky" (if supplied) to its @@ -305,17 +331,19 @@ class CuMatrixBase { /// Inversion for positive definite symmetric matrices. /// Treats the input as symmetric but only reads the lower triangle. /// The output is symmetric. - void SymInvertPosDef(); + void SymInvertPosDef(); void ApplyPow(Real power); - ///< Apply power to the absolute value of each element. - ///< If include_sign is true, the result will be multiplied with + ///< Apply power to the absolute value of each element. + ///< If include_sign is true, the result will be multiplied with ///< the sign of the input value. ///< If the power is negative and the input to the power is zero, ///< the output will be set to zero. If include_sign is true, it will ///< multiply the result by the sign of the input. void ApplyPowAbs(Real power, bool include_sign=false); - void ApplyHeaviside(); ///< For each element, sets x = (x > 0 ? 1.0 : 0.0) + /// For each element, sets x = (x > 0 ? 1.0 : 0.0). + /// See also Heaviside(). + void ApplyHeaviside(); void ApplyFloor(Real floor_val); void ApplyCeiling(Real ceiling_val); void ApplyExp(); @@ -331,16 +359,7 @@ class CuMatrixBase { /// Find the id of the maximal element for each row void FindRowMaxId(CuArray *id) const; - - /* - // Copy row interval from matrix - // @param r [in] number of rows to copy. - // @param src [in] source matrix. - // @param src_ro [in] source matrix row offset. - // @param dst_ro [in] destination matrix row offset. - // void CopyRowsFromMat(int32 r, const CuMatrixBase &src, int32 src_ro, int32 dst_ro); - */ - + /// Math operations, some calling kernels void SetZero(); void Set(Real value); @@ -349,18 +368,18 @@ class CuMatrixBase { void SetZeroAboveDiag(); void Scale(Real value); void ApplyLog(); - - /// Multiply two matrices elementwise: C = A .* C + + /// Multiply two matrices elementwise: C = C .* A void MulElements(const CuMatrixBase &A); - /// Divide two matrices elementwise: C = A ./ C + /// Divide two matrices elementwise: C = A ./ A void DivElements(const CuMatrixBase &A); /// Do, elementwise, *this = max(*this, A). void Max(const CuMatrixBase &A); /// scale i'th column by scale[i] - void MulColsVec(const CuVectorBase &scale); + void MulColsVec(const CuVectorBase &scale); /// scale i'th row by scale[i] void MulRowsVec(const CuVectorBase &scale); - /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j]. + /// divide each row into src.NumCols() groups, and then scale i'th row's jth group of elements by src[i, j].
void MulRowsGroupMat(const CuMatrixBase &src); /// divide i'th row by scale[i] void DivRowsVec(const CuVectorBase &div); @@ -369,7 +388,11 @@ class CuMatrixBase { /// *this += alpha * A void AddMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); - + + /// if A.NumRows() is multiple of (*this)->NumRows and A.NumCols() is multiple of (*this)->NumCols + /// divide A into blocks of the same size as (*this) and add them to *this (times alpha) + void AddMatBlocks(Real alpha, const CuMatrixBase &A, MatrixTransposeType trans = kNoTrans); + /// (for each column c of *this), c = alpha * col + beta * c void AddVecToCols(Real alpha, const CuVectorBase &col, Real beta = 1.0); /// (for each row r of *this), r = alpha * row + beta * r @@ -377,6 +400,8 @@ class CuMatrixBase { /// C = alpha * A(^T)*B(^T) + beta * C void AddMatMat(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuMatrixBase &B, MatrixTransposeType transB, Real beta); + /// A = alpha * x * y^T + A . + void AddVecVec(Real alpha, const CuVectorBase &x, const CuVectorBase &y); /// *this = a * b / c (by element; when c = 0, *this = a) void AddMatMatDivMat(const CuMatrixBase &A, const CuMatrixBase &B, const CuMatrixBase &C); @@ -386,30 +411,30 @@ class CuMatrixBase { void SymAddMat2(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transA, Real beta); - + /// This function is like AddMatMat but for where the second argument is of /// type CuBlockMatrix (a block-diagonal matrix of blocks). void AddMatBlock(Real alpha, const CuMatrixBase &A, MatrixTransposeType transA, const CuBlockMatrix &B, MatrixTransposeType transB, Real beta); - + /// *this = beta * *this + alpha * diag(v) * M [or M^T]. /// The same as adding M but scaling each row M_i by v(i). void AddDiagVecMat(const Real alpha, const CuVectorBase &v, - const CuMatrixBase &M, MatrixTransposeType transM, - Real beta = 1.0); + const CuMatrixBase &M, MatrixTransposeType transM, + Real beta = 1.0); // *this = beta * *this + alpha * M * diag(v) [or M^T]. // The same as adding M but scaling each column M_j by v(j). void AddMatDiagVec(const Real alpha, const CuMatrixBase &M, MatrixTransposeType transM, CuVectorBase &v, - Real beta = 1.0); + Real beta = 1.0); /// *this = beta * *this + alpha * A .* B (.* element by element multiplication) void AddMatMatElements(const Real alpha, - const CuMatrixBase& A, - const CuMatrixBase& B, - const Real beta); + const CuMatrixBase& A, + const CuMatrixBase& B, + const Real beta); /// this <-- beta*this + alpha*A*B void AddMatSp(const Real alpha, @@ -419,7 +444,7 @@ class CuMatrixBase { CuMatrix M(B); return AddMatMat(alpha, A, transA, M, kNoTrans, beta); } - + /// this <-- beta*this + alpha*SpA*B void AddSpMat(const Real alpha, const CuSpMatrix &A, @@ -465,7 +490,7 @@ class CuMatrixBase { } inline CuSubMatrix ColRange(const MatrixIndexT col_offset, const MatrixIndexT num_cols) const { - return CuSubMatrix(*this, 0, num_rows_, col_offset, num_cols); + return CuSubMatrix(*this, 0, num_rows_, col_offset, num_cols); } inline const CuSubVector Row(MatrixIndexT i) const { @@ -487,7 +512,7 @@ class CuMatrixBase { static_cast(num_cols_)); return CuValue(data_ + r * stride_ + c); } - + inline Real operator() (MatrixIndexT r, MatrixIndexT c) const { KALDI_PARANOID_ASSERT(static_cast(r) < static_cast(num_rows_) && @@ -513,11 +538,23 @@ class CuMatrixBase { // (*this). 
void AddElements(Real alpha, const std::vector >& input); - // This function resizes the output to indexes.size(), and for each element of - // "indexes" it interprets it as a (row, column) index into *this, and puts - // (*this)(row, column) into the corresponding element of "output". + // For each i, with indexes[i] = (j, k), does (*this)(j, k) += input[i]. + // Requires, but does not check, that the vector of indexes does not contain + // repeated elements; 'input' is the start of an array of length equal to + // indexes.Dim(), which is located on GPU memory if we are using the GPU. + void AddElements(Real alpha, const CuArray &indexes, + const Real *input); + + // This function requires that 'output' is a host array and is allocated with size + // of indexes.size(), and for each element of 'indexes' it interprets it as + // a (row, column) index into *this, and puts (*this)(row, column) into + // the corresponding element of 'output'. void Lookup(const std::vector &indexes, - std::vector *output) const; + Real *output) const; + + // CUDA version of Lookup, would be called internally by the above function. + void Lookup(const CuArray &indexes, + Real *output) const; // Creates binary mask with per-element equality predicates of *this, mat. // Output stored to 'mask', values : 1.0 = equal, 0.0 = not-equal. @@ -547,14 +584,14 @@ class CuMatrixBase { inline MatrixBase &Mat() { return *(reinterpret_cast* >(this)); } - + protected: - + // The constructors are protected to prevent the user creating an instance of // this class (you should create a child class CuMatrix or CuSubMatrix). - + CuMatrixBase(): data_(NULL), num_cols_(0), num_rows_(0), stride_(0) { } - + /// This constructor takes the #rows, #cols and stride; it's called from /// the constructor of CuSubMatrix. CuMatrixBase(Real *data, @@ -574,6 +611,7 @@ class CuMatrixBase { MatrixIndexT num_cols_; MatrixIndexT num_rows_; MatrixIndexT stride_; + private: KALDI_DISALLOW_COPY_AND_ASSIGN(CuMatrixBase); }; // class CuMatrixBase @@ -585,11 +623,12 @@ class CuMatrix: public CuMatrixBase { public: CuMatrix() { } - + /// Constructor with memory initialisation CuMatrix(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero) { - Resize(rows, cols, resize_type); + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride) { + Resize(rows, cols, resize_type, stride_type); } // Note: we had to remove the "explicit" keyword due @@ -599,7 +638,7 @@ class CuMatrix: public CuMatrixBase { explicit CuMatrix(const CuBlockMatrix &other, MatrixTransposeType trans = kNoTrans); - + explicit CuMatrix(const CuMatrixBase &other, MatrixTransposeType trans = kNoTrans); @@ -607,7 +646,7 @@ class CuMatrix: public CuMatrixBase { explicit CuMatrix(const MatrixBase &other, MatrixTransposeType trans = kNoTrans); - /// Copy constructor taking SpMatrix... + /// Copy constructor taking SpMatrix...
explicit CuMatrix(const CuSpMatrix &M) : CuMatrixBase() { Resize(M.NumRows(), M.NumRows(), kUndefined); this->CopyFromSp(M); @@ -625,7 +664,7 @@ class CuMatrix: public CuMatrixBase { template explicit CuMatrix(const CuMatrixBase &M, MatrixTransposeType trans = kNoTrans); - + CuMatrix &operator = (const CuMatrixBase &other) { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); @@ -636,8 +675,8 @@ class CuMatrix: public CuMatrixBase { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); return *this; - } - + } + CuMatrix &operator = (const MatrixBase &other) { this->Resize(other.NumRows(), other.NumCols(), kUndefined); this->CopyFromMat(other); @@ -648,14 +687,15 @@ class CuMatrix: public CuMatrixBase { /// Allocate the memory void Resize(MatrixIndexT rows, MatrixIndexT cols, - MatrixResizeType resize_type = kSetZero); - + MatrixResizeType resize_type = kSetZero, + MatrixStrideType stride_type = kDefaultStride); + void Swap(Matrix *mat); void Swap(CuMatrix *mat); template void Swap(CuMatrix *mat); - + /// I/O functions void Read(std::istream &is, bool binary); @@ -698,7 +738,16 @@ class CuSubMatrix: public CuMatrixBase { const MatrixIndexT num_rows, const MatrixIndexT col_offset, const MatrixIndexT num_cols); - + + // This constructor should be used with caution; it can be used for + // constructing 'fake' submatrices if you want to play with + // the stride. 'data' should point to GPU data if you're using the + // GPU. + inline CuSubMatrix(const Real *data, + const MatrixIndexT num_rows, + const MatrixIndexT num_cols, + const MatrixIndexT stride); + /// This type of constructor is needed for Range() to work [in CuMatrix base /// class]. Cannot make it explicit or that breaks. inline CuSubMatrix (const CuSubMatrix &other): @@ -717,8 +766,8 @@ bool ApproxEqual(const CuMatrixBase &A, } template -inline void AssertEqual(CuMatrixBase &A, CuMatrixBase &B, - float tol = 0.01) { +inline void AssertEqual(const CuMatrixBase &A, + const CuMatrixBase &B, float tol = 0.01) { KALDI_ASSERT(A.ApproxEqual(B, tol)); } @@ -742,8 +791,8 @@ template template Matrix::Matrix(const CuMatrixBase &M, MatrixTransposeType trans) { - if (trans == kNoTrans) Init(M.NumRows(), M.NumCols()); - else Init(M.NumCols(), M.NumRows()); + if (trans == kNoTrans) Init(M.NumRows(), M.NumCols(), kDefaultStride); + else Init(M.NumCols(), M.NumRows(), kDefaultStride); M.CopyToMat(this, trans); } diff --git a/src/cudamatrix/cu-matrixdim.h b/src/cudamatrix/cu-matrixdim.h index 32df913b4b1..dab7bd40eb2 100644 --- a/src/cudamatrix/cu-matrixdim.h +++ b/src/cudamatrix/cu-matrixdim.h @@ -57,7 +57,7 @@ extern "C" { // we define the following constants here because this file is included // both by the C++ code and also CUDA code. - + // The size of a CUDA 1-d block, e.g. for vector operations.. 
#define CU1DBLOCK 256 diff --git a/src/cudamatrix/cu-packed-matrix-test.cc b/src/cudamatrix/cu-packed-matrix-test.cc index 8fb8b2d1182..381ced3b2c2 100644 --- a/src/cudamatrix/cu-packed-matrix-test.cc +++ b/src/cudamatrix/cu-packed-matrix-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-sp-matrix-test.cc +// cudamatrix/cu-packed-matrix-test.cc // // Copyright 2013 Ehsan Variani // Lucas Ondel diff --git a/src/cudamatrix/cu-packed-matrix.cc b/src/cudamatrix/cu-packed-matrix.cc index 017b719a749..fd69e652be0 100644 --- a/src/cudamatrix/cu-packed-matrix.cc +++ b/src/cudamatrix/cu-packed-matrix.cc @@ -22,7 +22,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -297,8 +297,8 @@ void CuPackedMatrix::Scale(Real alpha) { Timer tim; size_t nr = static_cast(num_rows_), num_elements = ((nr * (nr+1)) / 2); - cublas_scal(num_elements, alpha, data_, 1); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_scal(GetCublasHandle(), num_elements, alpha, data_, 1)); + CuDevice::Instantiate().AccuProfile("CuPackedMatrix::Scale", tim.Elapsed()); } else #endif @@ -333,7 +333,7 @@ void CuPackedMatrix::AddPacked(const Real alpha, const CuPackedMatrix &M); - + private: // Disallow assignment. PackedMatrix & operator=(const PackedMatrix &other); diff --git a/src/cudamatrix/cu-sp-matrix-speed-test.cc b/src/cudamatrix/cu-sp-matrix-speed-test.cc index 9ad0f6d23db..455bf58608f 100644 --- a/src/cudamatrix/cu-sp-matrix-speed-test.cc +++ b/src/cudamatrix/cu-sp-matrix-speed-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-matrix-speed-test.cc +// cudamatrix/cu-sp-matrix-speed-test.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) @@ -53,7 +53,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { if (iter > 0) { B.Invert(); } else { // do some more testing... - + CuMatrix D(A); A.AddMat2(1.0, D, kTrans, 1.0); A.AddToDiag(0.1 * dim); @@ -61,10 +61,10 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { CuMatrix C(B); B.AddMat2(1.0, C, kTrans, 1.0); B.AddToDiag(0.1 * dim); - + A.Invert(); B.Invert(); - + SpMatrix E(dim); B.CopyToSp(&E); @@ -82,7 +82,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) { template static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.01; int32 iter = 0; Timer tim; CuMatrix A(dim, dim); diff --git a/src/cudamatrix/cu-sp-matrix.cc b/src/cudamatrix/cu-sp-matrix.cc index 128d056bad2..2ad5834b796 100644 --- a/src/cudamatrix/cu-sp-matrix.cc +++ b/src/cudamatrix/cu-sp-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -115,9 +115,9 @@ void CuSpMatrix::AddVec2(const Real alpha, const CuVectorBase &v) { dim3 dimBlock(CU2DBLOCK, CU2DBLOCK); dim3 dimGrid(n_blocks(nr, CU2DBLOCK), n_blocks(nr, CU2DBLOCK)); - cublas_spr('U', this->num_rows_, alpha, v.Data(), - 1, this->Data()); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_spr(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, this->num_rows_, alpha, v.Data(), + 1, this->Data())); + CuDevice::Instantiate().AccuProfile("CuSpMatrix::AddVec2", tim.Elapsed()); } else #endif @@ -145,10 +145,10 @@ void CuSpMatrix::AddMat2(const Real alpha, const CuMatrixBase &M, return; } - char trans = (transM == kTrans ? 'N' : 'T'); + cublasOperation_t trans = (transM == kTrans ? 
CUBLAS_OP_N : CUBLAS_OP_T); CuMatrix tmp_mat(*this); - cublas_syrk('U', trans, this_dim, m_other_dim, alpha, M.Data(), + cublas_syrk(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, trans, this_dim, m_other_dim, alpha, M.Data(), M.Stride(), beta, tmp_mat.Data(), tmp_mat.Stride()); this->CopyFromMat(tmp_mat, kTakeLower); @@ -218,7 +218,6 @@ bool CuSpMatrix::IsUnit(Real tol) const { // Note: we could do this more efficiently still, by slightly changing the // definition of IsUnit and getting rid of the extra stuff inside TraceSpSp // that corrects for the diagonal being counted twice. - return (TraceSpSp(*this, *this) + this->NumRows() - 2.0 * this->Trace() <= tol * this->NumRows()); } diff --git a/src/cudamatrix/cu-sp-matrix.h b/src/cudamatrix/cu-sp-matrix.h index 2fa46c332f6..2b66c208149 100644 --- a/src/cudamatrix/cu-sp-matrix.h +++ b/src/cudamatrix/cu-sp-matrix.h @@ -135,7 +135,6 @@ class CuSpMatrix : public CuPackedMatrix { inline SpMatrix &Mat() { return *(reinterpret_cast* >(this)); } - }; template diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc index a94f4685928..726b6e5ccd8 100644 --- a/src/cudamatrix/cu-sparse-matrix-test.cc +++ b/src/cudamatrix/cu-sparse-matrix-test.cc @@ -77,7 +77,7 @@ static void UnitTestCuSparseMatrixSum() { Real sum1 = cu_smat.Sum(); Real sum2 = mat.Sum(); - AssertEqual(sum1, sum2, 0.00001); + KALDI_ASSERT(fabs(sum1 - sum2) < 1.0e-05); } } diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc index e4808615728..17d69ce849a 100644 --- a/src/cudamatrix/cu-sparse-matrix.cc +++ b/src/cudamatrix/cu-sparse-matrix.cc @@ -21,7 +21,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include @@ -55,9 +55,12 @@ MatrixIndexT CuSparseMatrix::NumElements() const { template Real CuSparseMatrix::Sum() const { + if (NumElements() == 0) + return 0.0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CuVector sum_vec(*this); + CuVector sum_vec(this->NumElements(), kUndefined); + this->CopyElementsToVec(&sum_vec); return sum_vec.Sum(); } else #endif @@ -70,7 +73,8 @@ template Real CuSparseMatrix::FrobeniusNorm() const { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - CuVector element_vec(*this); + CuVector element_vec(this->NumElements(), kUndefined); + this->CopyElementsToVec(&element_vec); return element_vec.Norm(2); } else #endif @@ -202,6 +206,27 @@ void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; template void CuSparseMatrix::CopyToSmat(SparseMatrix *smat) const; +template +void CuSparseMatrix::CopyElementsToVec(CuVectorBase *vec) const { + KALDI_ASSERT(vec != NULL); + KALDI_ASSERT(this->NumElements() == vec->Dim()); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + Timer tim; + cublas_copy(GetCublasHandle(), + this->NumElements(), + &(this->elements_.Data()->weight), + static_cast(sizeof(MatrixElement) / sizeof(Real)), + vec->Data(), 1); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + } else +#endif + { + Vector tmp(this->NumElements(), kUndefined); + Mat().CopyElementsToVec(&tmp); + vec->CopyFromVec(tmp); + } +} template void CuSparseMatrix::Swap(SparseMatrix *smat) { @@ -341,6 +366,7 @@ void GeneralMatrix::CopyToMat(CuMatrixBase *cu_mat, Matrix mat(cmat_); if (trans == kNoTrans) { cu_mat->CopyFromMat(mat); + break; } else { CuMatrix temp_cu; temp_cu.Swap(&mat); diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h index ff2ba238414..1298ee5ea5f 100644 --- a/src/cudamatrix/cu-sparse-matrix.h +++ 
b/src/cudamatrix/cu-sparse-matrix.h @@ -95,6 +95,10 @@ class CuSparseMatrix { template void CopyToSmat(SparseMatrix *smat) const; + /// Copy elements to CuVector. It is the caller's responsibility to resize + /// <*vec>. + void CopyElementsToVec(CuVectorBase *vec) const; + /// Swap with CPU-based matrix. void Swap(SparseMatrix *smat); diff --git a/src/cudamatrix/cu-tp-matrix-test.cc b/src/cudamatrix/cu-tp-matrix-test.cc index 87203ea3a65..e9d1d66aad9 100644 --- a/src/cudamatrix/cu-tp-matrix-test.cc +++ b/src/cudamatrix/cu-tp-matrix-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cu-sp-matrix-test.cc +// cudamatrix/cu-tp-matrix-test.cc // // Copyright 2013 Ehsan Variani // Lucas Ondel diff --git a/src/cudamatrix/cu-tp-matrix.cc b/src/cudamatrix/cu-tp-matrix.cc index efc12df2bfb..4c3d32d2468 100644 --- a/src/cudamatrix/cu-tp-matrix.cc +++ b/src/cudamatrix/cu-tp-matrix.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA==1 #include -#include +#include #endif #include "base/timer.h" @@ -74,9 +74,8 @@ void CuTpMatrix::Invert() { CU_SAFE_CALL(cudaGetLastError()); CuMatrix tmp2(dim, dim); tmp2.CopyFromTp(*this); - cublas_trsm(dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride, - tmp.Data(), tmp.Dim().stride); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_trsm(GetCublasHandle(), dim, dim, alpha, tmp2.Data(), tmp2.Dim().stride, + tmp.Data(), tmp.Dim().stride)); this->CopyFromMat(tmp, kNoTrans); } else #endif diff --git a/src/cudamatrix/cu-tp-matrix.h b/src/cudamatrix/cu-tp-matrix.h index 1b74dd98470..8de46ec46f5 100644 --- a/src/cudamatrix/cu-tp-matrix.h +++ b/src/cudamatrix/cu-tp-matrix.h @@ -83,7 +83,6 @@ class CuTpMatrix : public CuPackedMatrix { inline TpMatrix &Mat() { return *(reinterpret_cast* >(this)); } - }; } // namespace diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index b42a04204f2..1e3b46a4ac7 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ b/src/cudamatrix/cu-vector-speed-test.cc @@ -39,7 +39,7 @@ std::string NameOf() { } template void TestCuVectorSoftmax(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -57,7 +57,7 @@ template void TestCuVectorSoftmax(int32 dim) { template void TestCuVectorSum(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -75,7 +75,7 @@ template void TestCuVectorSum(int32 dim) { template void TestCuVectorVecVecOne(int32 dim) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector M(dim); M.SetRandn(); @@ -99,7 +99,7 @@ template void TestCuVectorVecVecOne(int32 dim) { template void TestCuVectorAddDiagMatMat(int32 dim, MatrixTransposeType transN, MatrixTransposeType transO) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector v(dim); v.SetRandn(); CuMatrix N(dim, dim), O(dim, dim); @@ -108,7 +108,7 @@ template void TestCuVectorAddDiagMatMat(int32 dim, Timer tim; int32 iter = 0; - + for (;tim.Elapsed() < time_in_secs; iter++) { v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0); } @@ -123,7 +123,7 @@ template void TestCuVectorAddDiagMatMat(int32 dim, template void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) { - BaseFloat time_in_secs = 0.05; + BaseFloat time_in_secs = 0.02; CuVector v(dim); v.SetRandn(); CuMatrix N(dim, dim); @@ -131,7 +131,7 @@ template void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeT Timer tim; int32 iter = 0; - + for (;tim.Elapsed() < time_in_secs; iter++) { v.AddDiagMat2(1.0, N, trans, 0.0); } @@ -209,7 +209,7 @@ 
template void CudaVectorSpeedTest() { TestCuVectorAddDiagMatMat(sizes[s], kTrans, kNoTrans); TestCuVectorAddDiagMatMat(sizes[s], kTrans, kTrans); } - for (int32 s = 0; s < ns; s++) { + for (int32 s = 0; s < ns; s++) { TestCuVectorAddDiagMat2(sizes[s], kNoTrans); TestCuVectorAddDiagMat2(sizes[s], kTrans); } @@ -221,7 +221,7 @@ template void CudaVectorSpeedTest() { TestCuVectorAddColSumMat(sizes[s], kNoTrans); TestCuVectorAddColSumMat(sizes[s], kTrans); } - + } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index db715d75d7a..9b7aa97776a 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -1,4 +1,4 @@ -// cudamatrix/cuda-vector-test.cc +// cudamatrix/cu-vector-test.cc // Copyright 2013 Lucas Ondel // 2013 Johns Hopkins University (author: Daniel Povey) @@ -22,7 +22,7 @@ #include #include #include - +#include #include "base/kaldi-common.h" #include "util/common-utils.h" #include "cudamatrix/cu-matrix.h" @@ -62,7 +62,7 @@ static void UnitTestCuVectorIO() { } -template +template static void UnitTestCuVectorCopyFromVec() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -80,7 +80,7 @@ static void UnitTestCuVectorCopyFromVec() { } } -template +template static void UnitTestCuSubVector() { for (int32 iter = 0 ; iter < 10; iter++) { int32 M1 = 1 + rand () % 10, M2 = 1 + Rand() % 1, M3 = 1 + Rand() % 10, M = M1 + M2 + M3, @@ -97,7 +97,7 @@ static void UnitTestCuSubVector() { -template +template static void UnitTestCuVectorMulTp() { for (int32 i = 1; i < 10; i++) { MatrixIndexT dim = 10 * i; @@ -105,7 +105,7 @@ static void UnitTestCuVectorMulTp() { A.SetRandn(); TpMatrix B(dim); B.SetRandn(); - + CuVector C(A); CuTpMatrix D(B); @@ -127,10 +127,10 @@ static void UnitTestCuVectorAddTp() { B.SetRandn(); Vector C(dim); C.SetRandn(); - + CuVector D(A); CuTpMatrix E(B); - CuVector F(C); + CuVector F(C); A.AddTpVec(1.0, B, kNoTrans, C, 1.0); D.AddTpVec(1.0, E, kNoTrans, F, 1.0); @@ -160,7 +160,7 @@ template void CuVectorUnitTestAddVec() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -177,7 +177,7 @@ template void CuVectorUnitTestAddVecCross() { CuVector vec1_orig(vec1); Real alpha = 0.43243; vec1.AddVec(alpha, vec2); - + for (int32 i = 0; i < M; i++) AssertEqual(vec1_orig(i) + alpha * vec2(i), vec1(i)); } else { @@ -198,7 +198,7 @@ template void CuVectorUnitTestAddVecExtra() { CuVector vec1_orig(vec1); BaseFloat alpha = 0.43243, beta = 1.4321; vec1.AddVec(alpha, vec2, beta); - + for (int32 i = 0; i < M; i++) AssertEqual(beta * vec1_orig(i) + alpha * vec2(i), vec1(i)); } @@ -268,6 +268,20 @@ template static void UnitTestCuVectorReplaceValue() { } } +template static void UnitTestCuVectorSum() { + for (int32 i = 0; i < 200; i++) { + int32 start_dim = RandInt(1, 500), end_dim = RandInt(1, 500); + int32 dim = RandInt(10, 12000) + start_dim + end_dim; + Real quiet_nan = nan(""); // this is from . 
+ Vector vec(start_dim + dim + end_dim); + vec.Range(0, start_dim).Set(quiet_nan); + vec.Range(start_dim, dim).Set(1.0); + vec.Range(start_dim + dim, end_dim).Set(quiet_nan); + BaseFloat sum = vec.Range(start_dim, dim).Sum(); + KALDI_ASSERT(ApproxEqual(sum, dim)); + } +} + template void CuVectorUnitTestInvertElements() { // Also tests MulElements(); int32 M = 256 + Rand() % 100; @@ -288,7 +302,7 @@ template void CuVectorUnitTestSum() { CuVector A(dim), ones(dim); A.SetRandn(); ones.Set(1.0); - + AssertEqual(VecVec(A, ones), A.Sum()); } } @@ -320,7 +334,7 @@ template void CuVectorUnitTestCopyFromMat() { } Matrix matrix(cu_matrix), matrix2(M, N); CuMatrix matrix3(M, N); - + CuVector vector(M * N), vector2(M * N); vector.CopyRowsFromMat(cu_matrix); vector2.CopyRowsFromMat(matrix); @@ -328,8 +342,8 @@ template void CuVectorUnitTestCopyFromMat() { matrix3.CopyRowsFromVec(Vector(vector2)); Vector vector3(M * N); vector3.CopyRowsFromMat(cu_matrix); - - + + for(int32 j = 0; j < M*N; j++) { if (Rand() % 500 == 0) { // random small subset (it was slow) KALDI_ASSERT(vector(j) == cu_matrix(j/N, j%N)); @@ -412,7 +426,7 @@ template void CuVectorUnitTestNorm() { KALDI_ASSERT(ApproxEqual(cu_vector.Norm(1.0), 3.0)); KALDI_ASSERT(ApproxEqual(cu_vector.Norm(2.0), sqrt(5.0))); } - + template void CuVectorUnitTestMin() { for (int32 p = 0; p < 5; p++) { @@ -496,7 +510,7 @@ template void CuVectorUnitTestApplyFloor() { BaseFloat floor = 0.33 * (-5 + Rand() % 10); int32 i = cu_vector.ApplyFloor(floor); int32 j = vector.ApplyFloor(floor); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -507,6 +521,27 @@ template void CuVectorUnitTestApplyFloor() { } } +template void CuVectorUnitTestApplyCeiling() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector(dim); + cu_vector.SetRandn(); + + Vector vector(cu_vector); + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + int32 i = cu_vector.ApplyCeiling(floor); + int32 j = vector.ApplyCeiling(floor); + + CuVector cu2(vector); + + AssertEqual(cu2, cu_vector); + if (i != j) { + KALDI_WARN << "ApplyCeiling return code broken..."; + } + KALDI_ASSERT(i==j); + } +} + template void CuVectorUnitTestApplyPow() { for (int32 l = 0; l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -519,7 +554,7 @@ template void CuVectorUnitTestApplyPow() { BaseFloat pow = -2 + (Rand() % 5); cu_vector.ApplyPow(pow); vector.ApplyPow(pow); - + CuVector cu2(vector); AssertEqual(cu2, cu_vector); @@ -558,7 +593,7 @@ template void CuVectorUnitTestAddDiagMat2() { cu_mat_orig.SetRandn(); MatrixTransposeType trans = (p % 2 == 0 ? kNoTrans : kTrans); CuMatrix cu_mat(cu_mat_orig, trans); - + Vector vector(cu_vector); Matrix mat(cu_mat); @@ -583,12 +618,12 @@ static void CuVectorUnitTestAddDiagMatMat() { MatrixTransposeType transM = (iter % 2 == 0 ? kNoTrans : kTrans); MatrixTransposeType transN = ((iter/2) % 2 == 0 ? 
kNoTrans : kTrans); CuMatrix M(M_orig, transM), N(N_orig, transN); - + v.SetRandn(); CuVector w(v); w.AddDiagMatMat(alpha, M, transM, N, transN, beta); - + { CuVector w2(v); CuMatrix MN(dimM, dimM); @@ -648,7 +683,7 @@ template void CuVectorUnitTestAddSpVec() { CuSpMatrix mat_cu(M); mat_cu.SetRandn(); SpMatrix mat(mat_cu); - + BaseFloat alpha = 0.5 * (Rand() % 5), beta = 0.5 * (Rand() % 5); dst_cu.AddSpVec(alpha, mat_cu, src_cu, beta); dst.AddSpVec(alpha, mat, src, beta); @@ -674,6 +709,7 @@ template void CuVectorUnitTest() { CuVectorUnitTestScale(); CuVectorUnitTestSum(); CuVectorUnitTestInvertElements(); + UnitTestCuVectorSum(); CuVectorUnitTestAddRowSumMat(); CuVectorUnitTestAddColSumMat(); UnitTestCuVectorReplaceValue(); @@ -687,11 +723,12 @@ template void CuVectorUnitTest() { CuVectorUnitTestCopyDiagFromPacked(); CuVectorUnitTestCopyDiagFromMat(); CuVectorUnitTestCopyCross(); - CuVectorUnitTestCopyCross2(); - CuVectorUnitTestNorm(); + CuVectorUnitTestCopyCross2(); + CuVectorUnitTestNorm(); CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); + CuVectorUnitTestApplyCeiling(); CuVectorUnitTestApplyPow(); CuVectorUnitTestAddMatVec(); CuVectorUnitTestAddSpVec(); @@ -710,10 +747,10 @@ int main(int argc, char *argv[]) { const char *usage = "Usage: cu-vector-test [options]"; ParseOptions po(usage); - std::string use_gpu = "yes"; + std::string use_gpu = "yes"; po.Register("use-gpu", &use_gpu, "yes|no|optional"); po.Read(argc, argv); - + if (po.NumArgs() != 0) { po.PrintUsage(); exit(1); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index 16b554cab9a..6deb3809d85 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -20,7 +20,7 @@ #if HAVE_CUDA == 1 #include -#include +#include #endif #include "base/timer.h" @@ -48,12 +48,10 @@ Real VecVec(const CuVectorBase &a, KALDI_ASSERT(a.Dim() == b.Dim()); Real result = 0; #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - - result = cublas_dot(a.Dim(), a.Data(), 1, b.Data(), 1); - - CU_SAFE_CALL(cublasGetError()); + CU_SAFE_CALL(cublas_dot(GetCublasHandle(), a.Dim(), a.Data(), 1, b.Data(), + 1, &result)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -85,11 +83,9 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, MatrixInd #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { Timer tim; - int dimBlock(CU1DBLOCK); - int dimGrid(n_blocks(dim_,CU1DBLOCK)); - - cuda_copy_col_from_mat(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + cublas_copy(GetCublasHandle(), + this->dim_, mat.Data() + col, mat.Stride(), this->data_, 1); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif @@ -110,7 +106,7 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_df(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif @@ -132,8 +128,8 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_copy_col_from_mat_fd(dimGrid, dimBlock, data_, col, mat.Data(), mat.Dim(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - 
CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile("CuVectorBase::CopyColFromMat", tim.Elapsed()); } else #endif { @@ -143,7 +139,7 @@ void CuVectorBase::CopyColFromMat(const CuMatrixBase &mat, Matrix template void CuVectorBase::CopyRowsFromMat(const CuMatrixBase &mat) { - KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); + KALDI_ASSERT(dim_ == mat.NumCols() * mat.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; @@ -177,9 +173,9 @@ Real CuVectorBase::Norm(Real p) { KALDI_ASSERT(p == 1.0 || p == 2.0); if (dim_ == 0) return 0.0; if (p == 1.0) { - ans = cublas_asum(dim_, data_, 1); + cublas_asum(GetCublasHandle(), dim_, data_, 1, &ans); } else { - ans = cublas_nrm2(dim_, data_, 1); + cublas_nrm2(GetCublasHandle(), dim_, data_, 1, &ans); } CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); if (ans != ans) { @@ -248,7 +244,7 @@ void MatrixBase::CopyRowsFromVec(const CuVectorBase &v) { CopyRowsFromVec(v.Vec()); } } - + // instantiate the template above. template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); template void MatrixBase::CopyRowsFromVec(const CuVectorBase &v); @@ -280,10 +276,9 @@ Real CuVectorBase::Sum() const { CU_SAFE_CALL(cudaGetLastError()); Vector tmp(dimGrid); g.CopyToVec(&tmp); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); return tmp.Sum(); } else { - if (dim_ == 0) return 0.0; CuVector tmp(1, kUndefined); int dimBlock(CU1DBLOCK); int dimGrid = 1; // only 1 block here. we have loops in each thread. @@ -306,7 +301,7 @@ void CuVectorBase::ApplySoftMax() { if (dim_ == 0) return; Timer tim; size_t dimBlock = dim_ > CU1DBLOCK ? CU1DBLOCK : dim_; // for cuda_softmax_reduce function, dimBlock value is fixed min(CU1DBLOCK, dim) , represent CU1DBLOCK threads reduce a row at the same time. - size_t dimGrid = 1; // dimGrid value represent the number of rows + size_t dimGrid = 1; // dimGrid value represent the number of rows ::MatrixDim dim = { 1, this->dim_, this->dim_}; cuda_softmax_reduce(dimGrid, dimBlock, data_, data_, dim, this->dim_);//actually dim is not stride... 
CU_SAFE_CALL(cudaGetLastError()); @@ -329,9 +324,9 @@ MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { int dimGrid(n_blocks(dim_,CU1DBLOCK)); CuVector count_vec(dim_, kUndefined); - + cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); num_floored = count_vec.Sum(); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else @@ -344,22 +339,27 @@ MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { } template -void CuVectorBase::ApplyCeiling(Real ceiling_val) { +MatrixIndexT CuVectorBase::ApplyCeiling(Real ceiling_val) { + MatrixIndexT num_ceiled = 0; #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return 0; Timer tim; - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); - MatrixDim pseudo_matrix_dim = { 1, Dim(), Dim() }; // vector is a matix with 1 row, - cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, pseudo_matrix_dim); - CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim.Elapsed()); + int dimBlock(CU1DBLOCK); + int dimGrid(n_blocks(dim_,CU1DBLOCK)); + + CuVector count_vec(dim_, kUndefined); + + cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + num_ceiled = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else #endif { - Vec().ApplyCeiling(ceiling_val); + num_ceiled = Vec().ApplyCeiling(ceiling_val); } + return num_ceiled; } template @@ -370,12 +370,12 @@ void CuVectorBase::ApplyPow(Real power) { Timer tim; // for this particular kernel, x is #rows, y is #cols. so // fake matrix with 1 row, Dim() cols. - dim3 dimBlock(1, CU1DBLOCK); - dim3 dimGrid(1, n_blocks(Dim(), CU1DBLOCK)); + dim3 dimBlock(CU1DBLOCK, 1); + dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK), 1); ::MatrixDim fake_matrix_dim = { 1, Dim(), 1 }; // num_cols is Dim(), num_rows is 1, stride is 1 (it's a don't-care). cuda_apply_pow(dimGrid, dimBlock, data_, power, fake_matrix_dim); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim.Elapsed()); } else #endif @@ -395,7 +395,7 @@ void CuVectorBase::ApplyExp() { int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_vec_apply_exp(dimGrid, dimBlock, data_, dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyExp", tim.Elapsed()); } else #endif @@ -416,7 +416,7 @@ void CuVectorBase::ApplyLog() { CuVector flag(1); cuda_vec_apply_log(dimGrid, dimBlock, data_, flag.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); if (flag(0) > 0) KALDI_ERR << "Trying to take log of a negative number."; CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyLog", tim.Elapsed()); @@ -439,15 +439,16 @@ void CuVectorBase::AddMatVec(const Real alpha, KALDI_ASSERT(&v != this); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; // Everything is backwards in CuBlas. We need to reverse rows, columns, // transpose-ness. - cublas_gemv((trans==kTrans?'N':'T'), M.NumCols(), M.NumRows(), alpha, - M.Data(), M.Stride(), v.Data(), 1, beta, data_, 1); + CU_SAFE_CALL(cublas_gemv(GetCublasHandle(), + (trans==kTrans? 
CUBLAS_OP_N:CUBLAS_OP_T), + M.NumCols(), M.NumRows(), alpha, M.Data(), + M.Stride(), v.Data(), 1, beta, data_, 1)); - CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -470,9 +471,9 @@ void CuVectorBase::AddSpVec(const Real alpha, // Note: in our opinion the CuSpMatrix represents a lower-triangular matrix, but // in CUBLAS, for some stupid reason, everything is reversed. - cublas_spmv('U', Dim(), alpha, M.Data(), v.Data(), 1, beta, data_, 1); + CU_SAFE_CALL(cublas_spmv(GetCublasHandle(), CUBLAS_FILL_MODE_UPPER, Dim(), + alpha, M.Data(), v.Data(), 1, beta, data_, 1)); - CU_SAFE_CALL(cublasGetError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -494,7 +495,7 @@ void CuVectorBase::AddVecVec(Real alpha, const CuVectorBase &v, int dimGrid(n_blocks(dim_,CU1DBLOCK)); cuda_add_vec_vec(dimGrid, dimBlock, alpha, data_, v.Data(), r.Data(), beta, dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::AddVecVec", tim.Elapsed()); } else #endif @@ -529,7 +530,7 @@ void CuVectorBase::AddDiagMat2(Real alpha, const CuMatrixBase &M, #endif { Vec().AddDiagMat2(alpha, M.Mat(), trans, beta); - } + } } template @@ -562,19 +563,19 @@ void CuVectorBase::AddDiagMatMat( int dimGridLimit = (transM == kNoTrans && transN == kTrans ? 2048 : (transM == kTrans && transN == kNoTrans ? 16 : 32)); - + while (M_col_dim > 10 * threads_per_element && dimGrid < dimGridLimit && threads_per_element < 256) { threads_per_element *= 2; dimGrid = n_blocks(dim * threads_per_element, CU1DBLOCK); } - + cuda_add_diag_mat_mat(dimGrid, dimBlock, alpha, data_, dim, M.Data(), M_col_dim, M_row_stride, M_col_stride, N.Data(), N_row_stride, N_col_stride, threads_per_element, beta); CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -590,7 +591,7 @@ void CuVectorBase::AddTpVec(const Real alpha, const CuTpMatrix &M, KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows()); #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; if (beta == 0.0) { if (&v != this) CopyFromVec(v); @@ -601,7 +602,7 @@ void CuVectorBase::AddTpVec(const Real alpha, const CuTpMatrix &M, tmp.MulTp(M, trans); if (beta != 1.0) Scale(beta); // *this <-- beta * *this AddVec(alpha, tmp, 1.0); // *this += alpha * M * v - } + } } else #endif { @@ -617,8 +618,9 @@ void CuVectorBase::MulTp(const CuTpMatrix &M, const MatrixTransposeT if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; Timer tim; - cublas_tpmv((trans==kTrans?'N':'T'), M.NumRows(), M.Data(), data_, 1); - CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim.Elapsed()); + cublas_tpmv(GetCublasHandle(), (trans==kTrans? CUBLAS_OP_N:CUBLAS_OP_T), + M.NumRows(), M.Data(), data_, 1); + CuDevice::Instantiate().AccuProfile("CuVectorBase::MulTp", tim.Elapsed()); } else #endif { @@ -655,11 +657,11 @@ Real CuVectorBase::Max() const { if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) { // max of an empty set is -infinity. 
return -std::numeric_limits::infinity(); - } + } Timer tim; CuVector ans(1); cuda_vec_max(data_, ans.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); result = ans(0); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else @@ -670,9 +672,9 @@ Real CuVectorBase::Max() const { return result; } -template +template void CuVectorBase::ReplaceValue(Real orig, Real changed) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; Timer tim; @@ -698,7 +700,7 @@ void CuVectorBase::MulElements(const CuVectorBase &v) { int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_, CU1DBLOCK)); cuda_vec_mul_elements(dimGrid, dimBlock, data_, v.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile("CuVectorBase::MulElements", tim.Elapsed()); } else #endif @@ -719,7 +721,7 @@ void CuVectorBase::CopyFromVec(const CuVectorBase &src) { int dimGrid(n_blocks(dim_, CU2DBLOCK)); cuda_copy_from_vec_df(dimGrid, dimBlock, data_, src.data_, dim_); CU_SAFE_CALL(cudaGetLastError()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif { @@ -752,14 +754,14 @@ template template void CuVectorBase::CopyFromVec(const VectorBase &src) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { if (sizeof(Real) != sizeof(OtherReal)) { CuVector temp(dim_, kUndefined); temp.CopyFromVec(src); this->CopyFromVec(temp); } else { KALDI_ASSERT(src.Dim() == dim_); - if (dim_ == 0) return; + if (dim_ == 0) return; Timer tim; CU_SAFE_CALL(cudaMemcpy(data_, src.Data(), src.Dim()*sizeof(Real), cudaMemcpyHostToDevice)); CuDevice::Instantiate().AccuProfile("CuVector::CopyFromVecH2D",tim.Elapsed()); @@ -780,34 +782,6 @@ void CuVectorBase::CopyFromVec(const VectorBase &src); template void CuVectorBase::CopyFromVec(const VectorBase &src); -template -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat) { - KALDI_ASSERT(dim_ == smat.NumElements()); -#if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { - Timer tim; - dim3 dimBlock(CU1DBLOCK, 1); - dim3 dimGrid(n_blocks(smat.NumElements(), CU1DBLOCK), 1); - cuda_copy_from_smat_as_vec(dimGrid, dimBlock, this->data_, - smat.Data(), smat.NumElements()); - CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); - } else -#endif - { - Vector tmp(smat.Mat()); - this->CopyFromVec(tmp); - } -} -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); -template -void CuVectorBase::CopyFromSmat(const CuSparseMatrix &smat); - template template void CuVectorBase::CopyToVec(VectorBase *dst) const { @@ -879,18 +853,18 @@ void CuVector::Resize(MatrixIndexT dim, MatrixResizeType t) { this->data_ = static_cast(CuDevice::Instantiate().Malloc(dim * sizeof(Real))); this->dim_ = dim; if (t == kSetZero) this->SetZero(); - CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim.Elapsed()); + CuDevice::Instantiate().AccuProfile("CuVector::Resize", tim.Elapsed()); } else #endif { Vector vec(dim); - this->Swap(&vec); + this->Swap(&vec); } } template void CuVector::Swap(Vector *vec) { -#if HAVE_CUDA == 1 +#if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { if (this->dim_ == 0) { if (vec->dim_ != 0) { @@ -927,7 +901,7 @@ void 
CuVector::Swap(Vector *vec) { template void CuVector::Destroy() { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { if (this->data_ != NULL) CuDevice::Instantiate().Free(this->data_); } else @@ -962,7 +936,7 @@ template void CuVectorBase::SetZero() { if (dim_==0 || data_==NULL) return; #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_>=0); KALDI_ASSERT(data_!=NULL); Timer tim; @@ -997,13 +971,13 @@ std::ostream &operator << (std::ostream &out, const CuVectorBase &vec); template void CuVectorBase::Set(Real value) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU1DBLOCK); dim3 dimGrid(n_blocks(Dim(), CU1DBLOCK)); ::MatrixDim d = { 1, Dim(), Dim() }; - + cuda_set_const(dimGrid, dimBlock, data_, value, d); CU_SAFE_CALL(cudaGetLastError()); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); @@ -1019,7 +993,7 @@ void CuVectorBase::Set(Real value) { template void CuVectorBase::Add(Real value) { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; dim3 dimBlock(CU1DBLOCK); @@ -1063,8 +1037,9 @@ void CuVectorBase::CopyDiagFromMat(const CuMatrix &M) { if (CuDevice::Instantiate().Enabled()) { KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); Timer tim; - cublas_copy(dim_, M.Data(), M.Stride() + 1, data_, 1); - CU_SAFE_CALL(cudaGetLastError()); + CU_SAFE_CALL(cublas_copy(GetCublasHandle(), dim_, M.Data(), M.Stride() + 1, + data_, 1)); + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1101,14 +1076,13 @@ void CuVectorBase::AddVec(Real alpha, const CuVectorBase &vec, KALDI_ASSERT(vec.Dim() == Dim()); #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; int32 dim = this->dim_; Real *data = this->data_; const Real *vec_data = vec.data_; - if (beta != 1.0) cuda_scal(dim, beta, data, 1); - if (alpha != 0.0) cuda_axpy(dim, alpha, vec_data, 1, data, 1); - CU_SAFE_CALL(cudaGetLastError()); + if (beta != 1.0) CU_SAFE_CALL(cuda_scal(GetCublasHandle(), dim, beta, data, 1)); + if (alpha != 0.0) CU_SAFE_CALL(cuda_axpy(GetCublasHandle(), dim, alpha, vec_data, 1, data, 1)); CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif @@ -1161,20 +1135,20 @@ void CuVectorBase::AddColSumMat(Real alpha, } - -template + +template void CuVectorBase::InvertElements() { #if HAVE_CUDA == 1 - if (CuDevice::Instantiate().Enabled()) { + if (CuDevice::Instantiate().Enabled()) { Timer tim; - + dim3 dimBlock(CU1DBLOCK, 1); dim3 dimGrid(n_blocks(dim_, CU1DBLOCK)); MatrixDim d = {1, dim_, dim_}; cuda_invert_elements(dimGrid, dimBlock, data_, d); CU_SAFE_CALL(cudaGetLastError()); - + CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed()); } else #endif diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index ed7dd5bdcb2..54c1ac0ad4f 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -82,9 +82,6 @@ class CuVectorBase { template void CopyFromVec(const VectorBase &src); - template - void CopyFromSmat(const CuSparseMatrix &smat); - template void CopyToVec(VectorBase *dst) const; @@ -125,7 +122,7 @@ class CuVectorBase { void ApplyExp(); void ApplyLog(); MatrixIndexT ApplyFloor(Real floor_val); - void ApplyCeiling(Real ceiling_val); + MatrixIndexT ApplyCeiling(Real ceiling_val); void 
ApplyPow(Real power); Real Sum() const; void SetRandn(); @@ -215,6 +212,7 @@ class CuVectorBase { Real *data_; ///< GPU data pointer (or regular data pointer ///< if CUDA is not compiled in or we have no GPU). MatrixIndexT dim_; ///< dimension of the vector + private: KALDI_DISALLOW_COPY_AND_ASSIGN(CuVectorBase); }; @@ -252,13 +250,6 @@ class CuVector: public CuVectorBase { this->CopyFromVec(Vector(v)); } - template - explicit CuVector(const CuSparseMatrix &smat) : - CuVectorBase () { - Resize(smat.NumElements(), kUndefined); - this->CopyFromSmat(smat); - } - /// Allocate the memory void Resize(MatrixIndexT dim, MatrixResizeType t = kSetZero); @@ -339,8 +330,8 @@ bool ApproxEqual(const CuVectorBase &a, } template -inline void AssertEqual(CuVectorBase &a, CuVectorBase &b, - float tol = 0.01) { +inline void AssertEqual(const CuVectorBase &a, + const CuVectorBase &b, Real tol = 0.01) { KALDI_ASSERT(a.ApproxEqual(b, tol)); } diff --git a/src/cudamatrix/cublas-wrappers.h b/src/cudamatrix/cublas-wrappers.h index cec9b1fe9ac..f1d018a248d 100644 --- a/src/cudamatrix/cublas-wrappers.h +++ b/src/cudamatrix/cublas-wrappers.h @@ -25,85 +25,126 @@ namespace kaldi { #if HAVE_CUDA == 1 -inline void cublas_gemm(char transa, char transb, int m, int n,int k, float alpha, const float *A, int lda,const float *B, int ldb, float beta, float *C, int ldc) { - cublasSgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); -} -inline void cublas_gemm(char transa, char transb, int m, int n,int k, double alpha, const double *A, int lda,const double *B, int ldb, double beta, double *C, int ldc) { - cublasDgemm(transa,transb,m,n,k,alpha,A,lda,B,ldb,beta,C,ldc); -} -inline void cublas_trsm(int m, int n, float alpha, const float* A, int lda, float* B, int ldb) { - cublasStrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb); -} -inline void cublas_trsm(int m, int n, double alpha, const double* A, int lda, double* B, int ldb) { - cublasDtrsm('l','u','n','n',m,n,alpha,A,lda,B,ldb); -} -inline void cublas_syrk(char uplo, char trans, int n, int k, - float alpha, const float *A, int lda, - float beta, float *C, int ldc) { - cublasSsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc); -} -inline void cublas_syrk(char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - double beta, double *C, int ldc) { - cublasDsyrk(uplo,trans,n,k,alpha,A,lda,beta,C,ldc); -} -inline float cublas_dot(int n, const float *x, int incx, const float *y, int incy) { - return cublasSdot(n, x, incx, y, incy); -} -inline double cublas_dot(int n, const double *x, int incx, const double *y, int incy) { - return cublasDdot(n, x, incx, y, incy); -} -inline float cublas_asum(int n, const float* x, int incx) { - return cublasSasum(n, x, incx); -} -inline double cublas_asum(int n, const double* x, int incx) { - return cublasDasum(n, x, incx); -} -inline float cublas_nrm2(int n, const float* x, int incx) { - return cublasSnrm2(n, x, incx); +inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n,int k, float alpha, + const float *A, int lda, const float *B, int ldb, float beta, + float *C, int ldc) { + return cublasSgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); +} +inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n,int k, double alpha, + const double *A, int lda, const double *B, int ldb, double beta, + double *C, int ldc) { + return 
cublasDgemm_v2(handle,transa,transb,m,n,k,&alpha,A,lda,B,ldb,&beta,C,ldc); +} +inline cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, float alpha, + const float *x, int incx, const float *y, int incy, float *A, int lda ) { + return cublasSger_v2(handle,m,n,&alpha,x,incx,y,incy,A,lda); +} +inline cublasStatus_t cublas_ger(cublasHandle_t handle, int m, int n, double alpha, + const double *x, int incx, const double *y, int incy, double *A, int lda ) { + return cublasDger_v2(handle,m,n,&alpha,x,incx,y,incy,A,lda); +} +inline cublasStatus_t cublas_gemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, float alpha, + const float *A[], int lda, const float *B[], int ldb, float beta, + float *C[], int ldc, int batchCount) { + return cublasSgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); +} +inline cublasStatus_t cublas_gemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, double alpha, + const double *A[], int lda, const double *B[], int ldb, double beta, + double *C[], int ldc, int batchCount) { + return cublasDgemmBatched(handle, transa, transb, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, batchCount); +} +inline cublasStatus_t cublas_trsm(cublasHandle_t handle, int m, int n, float alpha, + const float* A, int lda, float* B, int ldb) { + return cublasStrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&alpha,A,lda,B,ldb); +} +inline cublasStatus_t cublas_trsm(cublasHandle_t handle, int m, int n, double alpha, + const double* A, int lda, double* B, int ldb) { + return cublasDtrsm_v2(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_UPPER,CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&alpha,A,lda,B,ldb); +} +inline cublasStatus_t cublas_syrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, int ldc) { + return cublasSsyrk_v2(handle,uplo,trans,n,k,&alpha,A,lda,&beta,C,ldc); +} +inline cublasStatus_t cublas_syrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, double alpha, + const double *A, int lda, double beta, double *C, int ldc) { + return cublasDsyrk_v2(handle,uplo,trans,n,k,&alpha,A,lda,&beta,C,ldc); +} +inline cublasStatus_t cublas_dot(cublasHandle_t handle, int n, const float *x, + int incx, const float *y, int incy, float *result) { + return cublasSdot_v2(handle, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublas_dot(cublasHandle_t handle, int n, const double *x, + int incx, const double *y, int incy, double *result) { + return cublasDdot_v2(handle, n, x, incx, y, incy, result); +} +inline cublasStatus_t cublas_asum(cublasHandle_t handle, int n, const float* x, + int incx, float *result) { + return cublasSasum_v2(handle, n, x, incx, result); +} +inline cublasStatus_t cublas_asum(cublasHandle_t handle, int n, const double* x, + int incx, double *result) { + return cublasDasum_v2(handle, n, x, incx, result); +} +inline cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const float* x, + int incx, float *result) { + return cublasSnrm2_v2(handle, n, x, incx, result); + } -inline double cublas_nrm2(int n, const double* x, int incx) { - return cublasDnrm2(n, x, incx); +inline cublasStatus_t cublas_nrm2(cublasHandle_t handle, int n, const double* x, + int incx, double *result) { + return cublasDnrm2_v2(handle, n, x, incx, result); } -inline void 
cublas_copy(int n, const float* x, int incx, - float* y, int incy) { - cublasScopy(n,x,incx,y,incy); +inline cublasStatus_t cublas_copy(cublasHandle_t handle, int n, const float* x, + int incx, float* y, int incy) { + return cublasScopy_v2(handle,n,x,incx,y,incy); } -inline void cublas_copy(int n, const double* x, int incx, - double* y, int incy) { - cublasDcopy(n,x,incx,y,incy); +inline cublasStatus_t cublas_copy(cublasHandle_t handle, int n, const double* x, + int incx, double* y, int incy) { + return cublasDcopy_v2(handle,n,x,incx,y,incy); } -inline void cublas_scal(int n, float alpha, float* mat, int incx) { - cublasSscal(n, alpha, mat, incx); +inline cublasStatus_t cublas_scal(cublasHandle_t handle, int n, float alpha, + float* mat, int incx) { + return cublasSscal_v2(handle, n, &alpha, mat, incx); } -inline void cublas_scal(int n, double alpha, double* mat, int incx) { - cublasDscal(n, alpha, mat, incx); +inline cublasStatus_t cublas_scal(cublasHandle_t handle, int n, double alpha, + double* mat, int incx) { + return cublasDscal_v2(handle, n, &alpha, mat, incx); } -inline void cublas_axpy(int n, float alpha, const float* x, int incx, float* y, int incy) { - cublasSaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, float alpha, + const float* x, int incx, float* y, int incy) { + return cublasSaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cublas_axpy(int n, double alpha, const double* x, int incx, double* y, int incy) { - cublasDaxpy(n, alpha, x, incx, y, incy); +inline cublasStatus_t cublas_axpy(cublasHandle_t handle, int n, double alpha, + const double* x, int incx, double* y, int incy) { + return cublasDaxpy_v2(handle, n, &alpha, x, incx, y, incy); } -inline void cublas_gemv(char trans, int m, int n, float alpha, - const float* A, int lda, const float* x, - int incx, float beta, float* y, int incy) { - cublasSgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy); +inline cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, float alpha, const float* A, int lda, const float* x, + int incx, float beta, float* y, int incy) { + return cublasSgemv_v2(handle,trans,m,n,&alpha,A,lda,x,incx,&beta,y,incy); } -inline void cublas_gemv(char trans, int m, int n, double alpha, - const double* A, int lda, const double* x, - int incx, double beta, double* y, int incy) { - cublasDgemv(trans,m,n,alpha,A,lda,x,incx,beta,y,incy); +inline cublasStatus_t cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, + int m, int n, double alpha, const double* A, int lda, const double* x, + int incx, double beta, double* y, int incy) { + return cublasDgemv_v2(handle,trans,m,n,&alpha,A,lda,x,incx,&beta,y,incy); } -inline void cublas_spmv(char uplo, int n, float alpha, const float *AP, const float *x, - int incx, float beta, float *y, int incy) { - cublasSspmv(uplo, n, alpha, AP, x, incx, beta, y, incy); +inline cublasStatus_t cublas_spmv(cublasHandle_t handle, cublasFillMode_t uplo, + int n, float alpha, const float *AP, const float *x, int incx, + float beta, float *y, int incy) { + return cublasSspmv_v2(handle, uplo, n, &alpha, AP, x, incx, &beta, y, incy); } -inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const double *x, - int incx, double beta, double *y, int incy) { - cublasDspmv(uplo, n, alpha, AP, x, incx, beta, y, incy); +inline cublasStatus_t cublas_spmv(cublasHandle_t handle, cublasFillMode_t uplo, + int n, double alpha, const double *AP, const double *x, int incx, + double beta, 
double *y, int incy) { + return cublasDspmv_v2(handle, uplo, n, &alpha, AP, x, incx, &beta, y, incy); } // Use caution with these, the 'transpose' argument is the opposite of what it @@ -111,22 +152,22 @@ inline void cublas_spmv(char uplo, int n, double alpha, const double *AP, const // had to switch 'l' to 'u'; we view our packed matrices as lower-triangular, // row-by-row, but CUDA views the same layout as upper-triangular, // column-by-column. -inline void cublas_tpmv(char trans, int n, - const float* Ap, float* x, int incx) { - return cublasStpmv('u', trans, 'n', n, Ap, x, incx); +inline cublasStatus_t cublas_tpmv(cublasHandle_t handle, cublasOperation_t trans, + int n, const float* Ap, float* x, int incx) { + return cublasStpmv_v2(handle, CUBLAS_FILL_MODE_UPPER, trans, CUBLAS_DIAG_NON_UNIT, n, Ap, x, incx); } -inline void cublas_tpmv(char trans, int n, const double* Ap, - double* x,int incx) { - return cublasDtpmv('u', trans, 'n', n, Ap, x, incx); +inline cublasStatus_t cublas_tpmv(cublasHandle_t handle, cublasOperation_t trans, + int n, const double* Ap, double* x,int incx) { + return cublasDtpmv_v2(handle, CUBLAS_FILL_MODE_UPPER, trans, CUBLAS_DIAG_NON_UNIT, n, Ap, x, incx); } -inline void cublas_spr(char uplo, int n, float alpha, const float *x, - int incx, float *AP) { - cublasSspr(uplo, n, alpha, x, incx, AP); +inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, + int n, float alpha, const float *x, int incx, float *AP) { + return cublasSspr_v2(handle, uplo, n, &alpha, x, incx, AP); } -inline void cublas_spr(char uplo, int n, double alpha, const double *x, - int incx, double *AP) { - cublasDspr(uplo, n, alpha, x, incx, AP); +inline cublasStatus_t cublas_spr(cublasHandle_t handle, cublasFillMode_t uplo, + int n, double alpha, const double *x, int incx, double *AP) { + return cublasDspr_v2(handle, uplo, n, &alpha, x, incx, AP); } #endif diff --git a/src/decoder/Makefile b/src/decoder/Makefile index e38f5ab63b6..95d5c6effca 100644 --- a/src/decoder/Makefile +++ b/src/decoder/Makefile @@ -1,9 +1,9 @@ all: -EXTRA_CXXFLAGS = -Wno-sign-compare -O3 +EXTRA_CXXFLAGS = -Wno-sign-compare include ../kaldi.mk -TESTFILES = +TESTFILES = OBJFILES = training-graph-compiler.o lattice-simple-decoder.o lattice-faster-decoder.o \ lattice-faster-online-decoder.o simple-decoder.o faster-decoder.o \ @@ -13,7 +13,7 @@ LIBNAME = kaldi-decoder ADDLIBS = ../transform/kaldi-transform.a ../tree/kaldi-tree.a ../lat/kaldi-lat.a \ ../sgmm/kaldi-sgmm.a ../gmm/kaldi-gmm.a ../hmm/kaldi-hmm.a ../util/kaldi-util.a \ - ../base/kaldi-base.a ../matrix/kaldi-matrix.a + ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index 275335a5ce9..c5c9aae743c 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -145,12 +145,12 @@ bool LatticeFasterDecoder::GetRawLattice(Lattice *ofst, TopSortTokens(active_toks_[f].toks, &token_list); for (size_t i = 0; i < token_list.size(); i++) if (token_list[i] != NULL) - tok_map[token_list[i]] = ofst->AddState(); + tok_map[token_list[i]] = ofst->AddState(); } // The next statement sets the start state of the output FST. Because we // topologically sorted the tokens, state zero must be the start-state. 
ofst->SetStart(0); - + KALDI_VLOG(4) << "init:" << num_toks_/2 + 3 << " buckets:" << tok_map.bucket_count() << " load:" << tok_map.load_factor() << " max:" << tok_map.max_load_factor(); @@ -224,6 +224,32 @@ void LatticeFasterDecoder::PossiblyResizeHash(size_t num_toks) { } } +/* + A note on the definition of extra_cost. + + extra_cost is used in pruning tokens, to save memory. + + Define the 'forward cost' of a token as zero for any token on the frame + we're currently decoding; and for other frames, as the shortest-path cost + between that token and a token on the frame we're currently decoding. + (by "currently decoding" I mean the most recently processed frame). + + Then define the extra_cost of a token (always >= 0) as the forward-cost of + the token minus the smallest forward-cost of any token on the same frame. + + We can use the extra_cost to accurately prune away tokens that we know will + never appear in the lattice. If the extra_cost is greater than the desired + lattice beam, the token would provably never appear in the lattice, so we can + prune away the token. + + The advantage of storing the extra_cost rather than the forward-cost, is that + it is less costly to keep the extra_cost up-to-date when we process new frames. + When we process a new frame, *all* the previous frames' forward-costs would change; + but in general the extra_cost will change only for a finite number of frames. + (Actually we don't update all the extra_costs every time we update a frame; we + only do it every 'config_.prune_interval' frames). + */ + // FindOrAddToken either locates a token in hash of toks_, // or if necessary inserts a new, empty token (i.e. with no forward links) // for the current frame. [note: it's inserted if necessary into hash toks_ @@ -352,7 +378,7 @@ void LatticeFasterDecoder::PruneForwardLinksFinal() { if (active_toks_[frame_plus_one].toks == NULL) // empty list; should not happen. KALDI_WARN << "No tokens alive at end of file"; - + typedef unordered_map::const_iterator IterType; ComputeFinalCosts(&final_costs_, &final_relative_cost_, &final_best_cost_); decoding_finalized_ = true; @@ -623,7 +649,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, KALDI_VLOG(6) << "Number of tokens active on frame " << NumFramesDecoded() << " is " << tmp_array_.size(); - + if (tmp_array_.size() > static_cast(config_.max_active)) { std::nth_element(tmp_array_.begin(), tmp_array_.begin() + config_.max_active, @@ -634,7 +660,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, if (adaptive_beam) *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; return max_active_cutoff; - } + } if (tmp_array_.size() > static_cast(config_.min_active)) { if (config_.min_active == 0) min_active_cutoff = best_weight; else { @@ -645,7 +671,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, tmp_array_.end()); min_active_cutoff = tmp_array_[config_.min_active]; } - } + } if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. if (adaptive_beam) *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; @@ -673,7 +699,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { BaseFloat cur_cutoff = GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " << adaptive_beam; - + PossiblyResizeHash(tok_cnt); // This makes sure the hash is always big enough. 
BaseFloat next_cutoff = std::numeric_limits::infinity(); @@ -761,7 +787,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { // it may cause us to process states unnecessarily (e.g. more than once), // but in the baseline code, turning this vector into a set to fix this // problem did not improve overall speed. - + KALDI_ASSERT(queue_.empty()); for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) queue_.push_back(e->key); @@ -771,7 +797,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { warned_ = true; } } - + while (!queue_.empty()) { StateId state = queue_.back(); queue_.pop_back(); diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index 158248cc445..514886d65ee 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -54,7 +54,7 @@ struct LatticeFasterDecoderConfig { // LatticeFasterDecoder class itself, but by the code that calls it, for // example in the function DecodeUtteranceLatticeFaster. fst::DeterminizeLatticePhonePrunedOptions det_opts; - + LatticeFasterDecoderConfig(): beam(16.0), max_active(std::numeric_limits::max()), min_active(200), @@ -99,7 +99,7 @@ class LatticeFasterDecoder { typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - + // instantiate this class once for each thing you have to decode. LatticeFasterDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -117,7 +117,7 @@ class LatticeFasterDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -230,12 +230,9 @@ class LatticeFasterDecoder { // links from it when we process the next frame. struct Token { BaseFloat tot_cost; // would equal weight.Value()... cost up to this point. - BaseFloat extra_cost; // >= 0. After calling PruneForwardLinks, this equals - // the minimum difference between the cost of the best path, and the cost of - // this is on, and the cost of the absolute best path, under the assumption - // that any of the currently active states at the decoding front may - // eventually succeed (e.g. if you were to take the currently active states - // one by one and compute this difference, and then take the minimum). + BaseFloat extra_cost; // >= 0. This is used in pruning a way tokens. + // there is a comment in lattice-faster-decoder.cc explaining this; + // search for "a note on the definition of extra_cost". ForwardLink *links; // Head of singly linked list of ForwardLinks @@ -365,8 +362,9 @@ class LatticeFasterDecoder { const fst::Fst &fst_; bool delete_fst_; std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... 
bool warned_; @@ -409,7 +407,7 @@ class LatticeFasterDecoder { void ClearActiveTokens(); - KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); + KALDI_DISALLOW_COPY_AND_ASSIGN(LatticeFasterDecoder); }; diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index 30adb6df302..b69b5492fb7 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -62,7 +62,7 @@ class LatticeFasterOnlineDecoder { BestPathIterator(void *t, int32 f): tok(t), frame(f) { } bool Done() { return tok == NULL; } }; - + // instantiate this class once for each thing you have to decode. LatticeFasterOnlineDecoder(const fst::Fst &fst, const LatticeFasterDecoderConfig &config); @@ -80,7 +80,7 @@ class LatticeFasterOnlineDecoder { const LatticeFasterDecoderConfig &GetOptions() const { return config_; } - + ~LatticeFasterOnlineDecoder(); /// Decodes until there are no more frames left in the "decodable" object.. @@ -107,12 +107,12 @@ class LatticeFasterOnlineDecoder { bool GetBestPath(Lattice *ofst, bool use_final_probs = true) const; - + /// This function does a self-test of GetBestPath(). Returns true on /// success; returns false and prints a warning on failure. bool TestGetBestPath(bool use_final_probs = true) const; - - + + /// This function returns an iterator that can be used to trace back /// the best path. If use_final_probs == true and at least one final state /// survived till the end, it will use the final-probs in working out the best @@ -133,7 +133,7 @@ class LatticeFasterOnlineDecoder { /// while leaving its "nextstate" variable unchanged. BestPathIterator TraceBackBestPath( BestPathIterator iter, LatticeArc *arc) const; - + /// Outputs an FST corresponding to the raw, state-level /// tracebacks. Returns true if result is nonempty. /// If "use_final_probs" is true AND we reached the final-state @@ -152,7 +152,7 @@ class LatticeFasterOnlineDecoder { bool use_final_probs, BaseFloat beam) const; - + /// InitDecoding initializes the decoding, and should only be used if you /// intend to call AdvanceDecoding(). If you call Decode(), you don't need to /// call this. You can also call InitDecoding if you have already decoded an @@ -334,7 +334,7 @@ class LatticeFasterOnlineDecoder { /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); - + /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. BaseFloat ProcessEmitting(DecodableInterface *decodable); @@ -343,7 +343,7 @@ class LatticeFasterOnlineDecoder { /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). void ProcessNonemitting(BaseFloat cost_cutoff); - + // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. It is indexed by frame-index @@ -361,9 +361,10 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. 
+ std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeFasterDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; diff --git a/src/decoder/lattice-tracking-decoder.h b/src/decoder/lattice-tracking-decoder.h index 91484b56c60..0737ca3db36 100644 --- a/src/decoder/lattice-tracking-decoder.h +++ b/src/decoder/lattice-tracking-decoder.h @@ -74,7 +74,7 @@ struct LatticeTrackingDecoderConfig { } void Check() const { - KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 + KALDI_ASSERT(beam > 0.0 && max_active > 1 && lattice_beam > 0.0 && prune_interval > 0 && beam_delta > 0.0 && hash_ratio >= 1.0 && extra_beam >= 0.0 && max_beam >= beam); } @@ -135,7 +135,7 @@ class LatticeTrackingDecoder { /// format. bool Decode(DecodableInterface *decodable, const fst::StdVectorFst &arc_graph); - + /// says whether a final-state was active on the last frame. If it was not, the /// lattice (or traceback) will end with states that are not final-states. bool ReachedFinal() const { return final_active_; } @@ -167,7 +167,7 @@ class LatticeTrackingDecoder { /// final-probs as one. bool GetLattice(fst::MutableFst *ofst, bool use_final_probs = true) const; - + private: struct Token; // ForwardLinks are the links from a token to a token on the next frame. @@ -181,13 +181,13 @@ class LatticeTrackingDecoder { ForwardLink *next; // next in singly-linked list of forward links from a // token. inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, - BaseFloat graph_cost, BaseFloat acoustic_cost, + BaseFloat graph_cost, BaseFloat acoustic_cost, ForwardLink *next): next_tok(next_tok), ilabel(ilabel), olabel(olabel), - graph_cost(graph_cost), acoustic_cost(acoustic_cost), + graph_cost(graph_cost), acoustic_cost(acoustic_cost), next(next) { } - }; - + }; + // Token is what's resident in a particular state at a particular time. // In this decoder a Token actually contains *forward* links. // When first created, a Token just has the (total) cost. We add forward @@ -200,19 +200,19 @@ class LatticeTrackingDecoder { // that any of the currently active states at the decoding front may // eventually succeed (e.g. if you were to take the currently active states // one by one and compute this difference, and then take the minimum). - + ForwardLink *links; // Head of singly linked list of ForwardLinks - + Token *next; // Next in list of tokens for this frame. - + StateId lat_state; // current state in graph arc lattice from first pass decoding // lat_state == fst::kNoStateId means that this token is not tracked - + inline Token(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLink *links, Token *next, StateId lat_state): tot_cost(tot_cost), extra_cost(extra_cost), links(links), next(next), lat_state(lat_state) { } inline void DeleteForwardLinks() { - ForwardLink *l = links, *m; + ForwardLink *l = links, *m; while (l != NULL) { m = l->next; delete l; @@ -221,7 +221,7 @@ class LatticeTrackingDecoder { links = NULL; } }; - + // head and tail of per-frame list of Tokens (list is in topological order), // and something saying whether we ever pruned it using PruneForwardLinks. 
struct TokenList { @@ -231,7 +231,7 @@ class LatticeTrackingDecoder { TokenList(): toks(NULL), must_prune_forward_links(true), must_prune_tokens(true) { } }; - + typedef HashList::Elem Elem; void PossiblyResizeHash(size_t num_toks); @@ -248,7 +248,7 @@ class LatticeTrackingDecoder { // lat_state is the next state in the arc graph lattice inline Token *FindOrAddToken(StateId state, StateId lat_state, int32 frame, BaseFloat tot_cost, bool *changed); - + // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens // all links, that have link_extra_cost > lattice_beam are pruned @@ -267,13 +267,13 @@ class LatticeTrackingDecoder { // on the final frame. If there are final tokens active, it uses // the final-probs for pruning, otherwise it treats all tokens as final. void PruneForwardLinksFinal(int32 frame); - + // Prune away any tokens on this frame that have no forward links. // [we don't do this in PruneForwardLinks because it would give us // a problem with dangling pointers]. // It's called by PruneActiveTokens if any forward links have been pruned void PruneTokensForFrame(int32 frame); - + // Go backwards through still-alive tokens, pruning them. note: cur_frame is // where hash toks_ are (so we do not want to mess with it because these tokens // don't yet have forward pointers), but we do all previous frames, unless we @@ -286,7 +286,7 @@ class LatticeTrackingDecoder { /// Version of PruneActiveTokens that we call on the final frame. /// Takes into account the final-prob of tokens. void PruneActiveTokensFinal(int32 cur_frame); - + /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem); @@ -311,9 +311,10 @@ class LatticeTrackingDecoder { std::vector tmp_array_; // used in GetCutoff. // make it class member to avoid internal new/delete. const fst::Fst &fst_; - std::vector cost_offsets_; // This contains, for each - // frame, an offset that was added to the acoustic likelihoods on that - // frame in order to keep everything in a nice dynamic range. + std::vector cost_offsets_; // This contains, for each + // frame, an offset that was added to the acoustic log-likelihoods on that + // frame in order to keep everything in a nice dynamic range i.e. close to + // zero, to reduce roundoff errors. LatticeTrackingDecoderConfig config_; int32 num_toks_; // current total #toks allocated... bool warned_; @@ -331,9 +332,9 @@ class LatticeTrackingDecoder { // to the caller, who then has to call toks_.Delete(e) for each one. It was designed // this way for convenience in propagating tokens from one frame to the next. void ClearToks(Elem *list); - + void ClearActiveTokens(); - + }; diff --git a/src/doc/README b/src/doc/README index ea30b348450..566f0d0bf64 100644 --- a/src/doc/README +++ b/src/doc/README @@ -3,7 +3,7 @@ #code itself, and its comments, is the rest of the source). Doxygen will create #the actual documentation in ../html/ (e.g. open ../html/index.html in a browser). #To run doxygen, type "doxygen" from one directory above this. If this does -#not work, search for "Kaldi main page" online and you will hopefully get a +#not work, search for "Kaldi main page" online and you will hopefully get a #version of the documentation. # Note: I generally run this file by typing ". 
doc/README" from src/, @@ -13,7 +13,7 @@ #ssh-keygen -t dsa -C "vpanayotov@shell.sf.net" #ssh-add # end then import the contents of .ssh/id_dsa.pub into -# http://sourceforge.net/account/services +# http://sourceforge.net/account/services #(from Dan:) The commands below show how I compile the documentation and copy it #to the homepage at sourceforge. I do this from JHU at the current time. @@ -24,7 +24,7 @@ doxygen cp doc/*.pptx html/; # get the style sheet in the html/ directory. # note, we actually use a modified version of the header, which is checked into -# doc/. +# doc/. doxygen -w html header.html footer.html stylesheet.css rm header.html footer.html mv stylesheet.css html/ @@ -34,37 +34,21 @@ if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then cp ../misc/logo/KaldiIco.png html/favicon.ico tar -czf html.tar.gz html - echo "**First copying to kaldi-asr.org**" - # First copy to kaldi-asr.org/docs + # Copy to kaldi-asr.org/docs2/ scp html.tar.gz newrelay:/var/www/kaldi-asr echo 'cd /var/www/kaldi-asr/; rm -rf html doc.old; - tar -xzf html.tar.gz; mv doc doc.old; mv html doc; rm -rf doc.old' \ + tar -xzf html.tar.gz; mv doc doc.old; mv html doc; rm -rf doc.old; rm html.tar.gz' \ | ssh newrelay bash - - echo "**Now copying to sourceforge**" - # Next copy to sourceforge. - if true; then # use method that works when their shell access is down. - rm -rf htdocs # make sure it's not left over from before. - mv html htdocs - scp -r htdocs danielpovey@web.sourceforge.net:/home/project-web/kaldi/ - mv htdocs html - else - scp html.tar.gz danielpovey@web.sourceforge.net:/home/project-web/kaldi/htdocs/ - ssh danielpovey,kaldi@shell.sourceforge.net create - echo 'cd /home/project-web/kaldi/htdocs/; rm -rf html; - tar -xzf html.tar.gz; for x in html/*; do mv $x .; done ' \ - | ssh danielpovey,kaldi@shell.sourceforge.net bash - fi fi # You could uncomment and run the lines below as an example of how to figure out # the amount of posts to the Kaldi forums on Sourceforge, per month. #curl 'http://sourceforge.net/p/kaldi/discussion/stats_data?forum=&begin=2011-04-14&end=2014-06-13' > foo -#cat foo | perl -ane ' s/.*://; @A = split("]"); +#cat foo | perl -ane ' s/.*://; @A = split("]"); # foreach $a(@A){ $a =~ s/[,\[]//g; print "$a\n"; }' | \ -# perl -e 'while(<>) { @A = split; if (@A == 2) { ($date, $count) = @A; $date /= 1000; +# perl -e 'while(<>) { @A = split; if (@A == 2) { ($date, $count) = @A; $date /= 1000; # @date_array = gmtime $date; $month = $date_array[4]; $year = 1900 + $date_array[5]; $count{$year. " " .sprintf("%02d", $month+1)} += $count; }} # foreach $k (sort keys %count) { print "$k $count{$k}\n"; } ' @@ -78,5 +62,3 @@ fi # and added it to the repo. # - - diff --git a/src/doc/chain.dox b/src/doc/chain.dox new file mode 100644 index 00000000000..9aa515d5b0e --- /dev/null +++ b/src/doc/chain.dox @@ -0,0 +1,424 @@ +// doc/chain.dox + +// Copyright 2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +namespace kaldi { + +/** + \page chain 'Chain' models + + \section chain_intro Introduction to 'chain' models + + The 'chain' models are a type of DNN-HMM model, implemented using \ref dnn3 "nnet3", and differ from the + conventional model in various ways; you can think of them as a different + design point in the space of acoustic models. + + - We use a 3 times smaller frame rate at the output of the neural net. + This significantly reduces the amount of computation required at + test time, making real-time decoding much easier. + - The models are trained right from the start with a sequence-level + objective function-- namely, the log probability of the correct sequence. It is + essentially MMI implemented without lattices on the GPU, by doing a full + forward-backward on a decoding graph derived from a phone n-gram language + model. + - Because of the reduced frame rate, we need to use unconventional + HMM topologies (allowing the traversal of the HMM in one state). + - We use fixed transition probabilities in the HMM, and don't train + them (we may decide to train them in future; but for the most part the neural-net + output probabilities can do the same job as the transition probabilities, + depending on the topology). + - Currently, only nnet3 DNNs are supported (see \ref dnn3), and + online decoding has not yet been implemented (we're aiming for April to June 2016). + - Currently the results are a bit better than those of conventional + DNN-HMMs (about 5\% relative better), but the system is about 3 times + faster to decode; training time is probably a bit faster too, but + we haven't compared it exactly. + + \section chain_scripts Where to find scripts for the 'chain' models + + The current best scripts for the 'chain' models can be found in the + Switchboard setup in egs/swbd/s5c; the script local/chain/run_tdnn_2o.sh is + the current best one. This is currently available in the 'chain' branch of + the official github repository (https://github.com/kaldi-asr/kaldi.git) and + will eventually be merged into master. + + This script uses TDNNs as the neural net (we've been doing the development + with TDNNs because they are easier to tune than LSTMs), and gives a better WER + than the baseline TDNN: 11.4\%, versus 12.1\% for the best TDNN baseline + (on the Switchboard-only portion of eval2000). + + \section chain_model The chain model + + The chain model itself is no different from a conventional DNN-HMM, used with + a (currently) 3-fold reduced frame rate at the output of the DNN. The input + features of the DNN are at the original frame rate of 100 per second; this makes + sense because all the neural nets we are currently using (LSTMs, TDNNs) have some kind + of recurrent connections or splicing inside them, i.e. they are not purely feedforward + nets. + + The difference from a normal model is the objective function used to train it: + instead of a frame-level objective, we use the log-probability of the correct + phone sequence as the objective function. The training process is quite + similar in principle to MMI training, in which we compute numerator and + denominator 'occupation probabilities' and the difference between the two is + used in the derivative computation. There is no need to normalize the DNN + outputs to sum to one on each frame any more; such normalization makes no difference.
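To make the derivative computation concrete, here is a minimal sketch (not the actual chain code; the function and argument names are hypothetical) of how the gradient with respect to the nnet output would be assembled from the two sets of occupation probabilities:

\verbatim
#include <vector>

// Hypothetical illustration: for each (frame, pdf-id) the derivative of the
// 'chain' objective w.r.t. the corresponding nnet output is the numerator
// occupation probability minus the denominator occupation probability.
// We assume num_post and den_post have identical shapes (frames x pdf-ids).
void ComputeChainDerivs(const std::vector<std::vector<double> > &num_post,
                        const std::vector<std::vector<double> > &den_post,
                        std::vector<std::vector<double> > *derivs) {
  derivs->resize(num_post.size());
  for (size_t t = 0; t < num_post.size(); t++) {
    (*derivs)[t].resize(num_post[t].size());
    for (size_t p = 0; p < num_post[t].size(); p++)
      (*derivs)[t][p] = num_post[t][p] - den_post[t][p];
  }
}
\endverbatim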
+ + Because of the reduced frame rate (one frame every 30 ms), we need to use a + modified HMM topology. We would like the HMM to be traversable in one + transition (as opposed to the 3 transitions of a model at the normal frame + rate). The currently favored topology has a state that can only occur once, + and then another state that can appear zero or more times. The state-clustering + is obtained using the same procedure as for GMM-based models, although + of course with a different topology (we convert the alignments to the new topology + and frame-rate). + + \section chain_training The training procedure for 'chain' models + + The training procedure for chain models is a lattice-free version of + MMI, where the denominator state posteriors are obtained by the + forward-backward algorithm over an HMM formed from a phone-level decoding graph, + and the numerator state posteriors are obtained by a similar forward-backward + algorithm but limited to sequences corresponding to the transcript. + + For each output index of the neural net (i.e. for each pdf-id), we + compute a derivative of the form (numerator occupation probability - + denominator occupation probability), and these are propagated back to the + network. + + + \subsection chain_training_denominator The denominator FST + + For the denominator part of the computation we do forward-backward over an HMM. + Actually, because we represent it as a finite state acceptor, the labels + (pdf-ids) are associated with the arcs and not the states, so it's not really an + HMM in the normal formulation, but it's easier to think of it as an HMM because + we use the forward-backward algorithm to get posteriors. + In the code and scripts we refer to it as the 'denominator FST'. + + \subsubsection chain_training_denominator_phone_lm Phone language model for the denominator FST + + The first stage in constructing the denominator FST is to create a phone + language model. This language model is learned from the training-data phone + alignments. This is an un-smoothed language model, meaning that we never + back off to lower-order n-grams. However, some language-model states are + removed entirely, so transitions to those states go instead to the lower-order + n-gram's state. The reason we avoid smoothing is to reduce the number of + arcs in the compiled graph after phonetic context expansion. + + The configuration that we settled on is to estimate a 4-gram language model, + and to never prune LM states below trigram (so we always maintain at least a + 2-phone history). On top of the number of states dictated by the no-prune + trigram rule, we have a specifiable number (e.g. 2000) of 4-gram language + model states which are to be retained (all the rest are identified with the + corresponding trigram state), and the ones we choose to retain are determined + in a way that maximizes the training-data likelihood. All probabilities are + estimated to maximize the training-data likelihood. The reason not to prune + the trigrams is that any sparsity in which trigrams are allowed will tend to + minimize the size of the compiled graph. Note that if our phone LM were just a + simple phone loop (i.e. a unigram), it would get expanded to triphones anyway + due to phonetic context effects, but it would have arcs for all possible + trigrams in it. So any sparsity we get from using the un-pruned trigram model + is a bonus.
Empirically, an un-smoothed trigram LM is what expands to the + smallest possible FST; and pruning some of the trigrams, while it increases + the size of the compiled FST, results in little or no WER improvement (at + least on 300 hours of data expanded 3-fold with speed perturbation; on less + data it might help). + + On the Switchboard setups the phone-LM perplexities for the various models we + tried were in the range 5 to 7; the phone-LM perplexity with our chosen + configuration (4-gram, pruned to trigram for all but 2000 states) was about 6. + It was not the case that lower phone-LM perplexity always led to better WER + of the trained system; as for conventional (word-based) MMI training, an + intermediate strength of language model seemed to work best. + + \subsubsection chain_training_denominator_compilation Compilation of the denominator FST + + The phone language model described in the previous section is expanded into an + FST with 'pdf-ids' as the arcs, in a process that mirrors the process of + decoding-graph compilation in normal Kaldi decoding (see \ref + graph_recipe_test), except that no lexicon is involved, and at the + end we convert the transition-ids to pdf-ids. + + One difference lies in how we minimize the size of the graph. The normal + recipe involves determinization and minimization. We were not able to + reduce the size of the graph using this procedure, or variants of it with + disambiguation symbols. Instead, our graph-minimization process can be described + compactly as follows: "Repeat 3 times: push, minimize, reverse; push, minimize, reverse". + 'push' refers to weight-pushing; 'reverse' refers to reversing the direction of the arcs, and + swapping initial and final states. + + + \subsubsection chain_training_denominator_normalization Initial and final probabilities, and 'normalization FST' + + The graph-creation process mentioned above naturally gives us an initial + state, and final probabilities for each state; but these are not the ones we + use in the forward-backward. The reason is that these probabilities are + applicable to utterance boundaries, but we train on split-up chunks of + utterances of a fixed length (e.g. 1.5 seconds). Constraining the HMM at these + arbitrarily chosen cut points to the initial and final states is not + appropriate. Instead, we use initial probabilities derived from 'running the HMM' for + a fixed number of iterations and averaging the probabilities; and final probabilities + equal to 1.0 for each state. We have a justification for this but don't have time to + explain it right now. In the denominator forward-backward process we apply these initial and + final probabilities to the initial and final frame as part of the computation. However, we also + write out a version of the denominator FST that has these initial and final probabilities, and we refer to + this as the 'normalization FST'. (The initial probabilities are emulated using epsilon arcs, because + FSTs do not support initial probabilities.) This 'normalization FST' will be used to add probabilities to the + numerator FSTs in a way that we'll describe later. + + \subsection chain_training_numerator Numerator FSTs + + As part of our preparation for the training process we produce something + called a 'numerator FST' for each utterance. The numerator FST encodes the + supervision transcript, and also encodes an alignment of that transcript + (i.e.
it forces similarity to a reference alignment obtained from a baseline + system), but it allows a little 'wiggle room' to vary from that reference. + By default we allow a phone to occur up to 0.05 seconds before or after its + begin and end positions respectively, in the lattice alignment. + Incorporating the alignment information is important because of the way we + train not on entire utterances but on split-up fixed-length pieces of + utterances (which, in turn, is important for GPU-based training): splitting up + the utterance into pieces is only possible if we know where the transcript aligns. + + Instead of enforcing a particular pronunciation of the training data, we use as + our reference a lattice of alternative pronunciations of the training data, + generated by a lattice-generating decoding procedure using an + utterance-specific graph as the decoding graph. This generates all alignments + of pronunciations that were within a beam of the best-scoring pronunciation. + + \subsubsection chain_training_numerator_splitting Splitting the numerator FSTs + + As mentioned, we train on fixed-size pieces of utterances (e.g. 1.5 seconds in + length). This requires that we split the numerator FSTs up into fixed-size + pieces. This isn't hard, since the numerator FSTs (which, remember, encode + time-alignment information) naturally have a structure where we can identify + any FST state with a particular frame index. Note: at the stage where we do this + splitting, there are no costs in the numerator FST yet-- it's just viewed as + encoding a constraint on paths-- so we do not have to make a decision about how to split up the costs +on the paths. + + \subsubsection chain_training_numerator_normalization Normalizing the numerator FSTs + + Above (\ref chain_training_denominator_normalization) we mentioned how we compute + initial and final probabilities for the denominator FST, and how we encode + these in a 'normalization FST'. We compose the split-up pieces of numerator + FST with this 'normalization FST' to ensure that the costs from the + denominator FST are reflected in the numerator FST. This ensures that + objective functions can never be positive (which makes them easier to + interpret), and also guards against the possibility that the numerator FST + could contain state sequences not allowed by the denominator FST, which in + principle could allow the objective function to increase without bound. The + reason why this could happen is that the phone LM lacks smoothing, and is + estimated from 1-best alignments, so the lattices could contain phone n-gram + sequences not seen in training. + + It happens occasionally (but very rarely) that this normalization process + generates an empty FST: this can occur when the lattice contains triphones that + were not present in the 1-best alignment used to train the phone language + model, and does not have any alternative paths at that point in the lattice + that could make up for the resulting 'failed' paths. This can happen because + the 1-best alignment and the lattice-producing alignment chose different + pronunciations of a word. These pieces of utterances are just discarded. + + \subsubsection chain_training_numerator_format Format of the numerator FSTs + + The numerator FSTs are weighted acceptors where the labels correspond to + pdf-ids plus one. We can't use the pdf-ids themselves as labels, because they could be zero, and zero + is treated specially (as epsilon) by OpenFst.
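+ Just to make that label convention concrete, here is a tiny Python sketch
+ (purely illustrative; this is not Kaldi code, and the helper function is
+ hypothetical) of turning a sequence of pdf-ids into the arcs of a linear
+ acceptor whose labels are pdf-id plus one:
+ \verbatim
+ # Toy sketch of the "pdf-id plus one" labeling convention described above.
+ # Label 0 is reserved for epsilon in OpenFst, so arc labels are pdf_id + 1.
+ def arcs_for_pdf_sequence(pdf_ids):
+     """Return (src_state, dest_state, label) arcs for a linear acceptor."""
+     return [(t, t + 1, pdf + 1) for t, pdf in enumerate(pdf_ids)]
+
+ print(arcs_for_pdf_sequence([0, 3, 3, 7]))
+ # -> [(0, 1, 1), (1, 2, 4), (2, 3, 4), (3, 4, 8)]
+ \endverbatim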
When we form minibatches, instead + of storing an array of separate numerator FSTs we actually append them together to form a longer FST; + this enables us to do a single forward-backward over all utterances in the minibatch, + which directly computes the total numerator log-probability. (This isn't an important + feature, it's just a software detail, which we explain here lest it generate confusion). + + \subsection chain_training_splitting Fixed-length chunks, and minibatches + + In order to train on minibatches, we split up our utterances into fixed-length + chunks of speech (of length 1.5 seconds in our current scripts). Utterances + shorter than this are discarded; those longer are split into chunks, with + either overlaps between the chunks or small gaps between them. Note that + our acoustic models typically require left or right frames for acoustic + context; we add that, but this is a separate issue; the context is added after + the chunks are decided on. + + Our minibatch size is usually a power of 2, and it can be limited by GPU + memory considerations. Many of our example scripts use 128 chunks per + minibatch. The largest single consumer of GPU memory is the alpha + probabilities in the forward-backward computation. For instance, with 1.5-second + chunks, we have 50 time steps after the 3-fold subsampling. In our + Switchboard setup a typical denominator FST has 30,000 states in it. We use + single-precision floating point for the alphas, so the memory used in + gigabytes is (128 * 50 * 30000 * 4) / 10^9 = 0.768G. + + This won't use up all the GPU memory, but there are other sources of memory use, + e.g. we keep around two copies of the nnet outputs in memory, which takes a + fair amount of memory depending on the configuration-- e.g. replace the 30000 + above with about 10000 and it will give you the amount of memory used for one + copy of the nnet outputs in a reasonable configuration. + + + \subsection chain_training_shifting Training on frame-shifted data + + In neural net training we already have ways of generating perturbed data to + artificially increase the amount of data we train on. Our standard nnet3 + neural-net training example scripts do time-warping of the raw audio, by + factors of 0.9, 1.0 and 1.1, to create 3-fold augmented data. This is + orthogonal to the 'chain' models, and we do it (or not) just as we would for + the baseline. However, there is an extra way we can augment the data for the + chain models, by shifting the frames. The output frame rate for these models + is one third the regular frame rate (configurable, of course), meaning we only + evaluate the nnet output at t values that are multiples of 3, so we + can generate different versions of the training data by shifting the training + examples by 0, 1 and 2 frames. This is done automatically in the training + script, and it's done 'on the fly' as we read the training examples from + disk-- the program nnet3-chain-copy-egs has a + --frame-shift option that is set by the script. This affects how + the number of epochs is interpreted. If the user requests, for instance, 4 + epochs, then we actually train for 12 epochs; we just do so on 3 + differently-shifted versions of the data. What the + --frame-shift=t option actually does is to shift the input frames + by t and shift the output frames by the closest multiple of 3 to + t. (In general the subsampling factor might not be 3; it's a configuration variable + named --frame-subsampling-factor).
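+ As a concrete illustration of the frame-shifting just described, here is a
+ small Python sketch. It is not the actual nnet3-chain-copy-egs code; it just
+ shows the assumed mapping from a requested shift to the input and output
+ frame indexes, for a frame-subsampling factor of 3:
+ \verbatim
+ # Illustrative sketch of the --frame-shift logic described above.
+ def shift_frames(input_frames, output_frames, shift, subsampling_factor=3):
+     """Shift input frames by 'shift'; shift output frames by the multiple of
+     'subsampling_factor' that is closest to 'shift'."""
+     output_shift = subsampling_factor * int(
+         round(float(shift) / subsampling_factor))
+     return ([t + shift for t in input_frames],
+             [t + output_shift for t in output_frames])
+
+ # Input frames 0..5, output frames evaluated every 3 frames, shift of 1:
+ print(shift_frames(list(range(6)), [0, 3], shift=1))
+ # -> ([1, 2, 3, 4, 5, 6], [0, 3])
+ \endverbatim
+ With shifts of 0, 1 and 2 giving distinct versions of the data, a request for
+ 4 epochs becomes 4 * 3 = 12 passes over the (shifted) data, as described above.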
+ + \subsection chain_training_gpu GPU issues in training + + The parts of the computation that are specific to the 'chain' models are + the forward-backward over the numerator FST and over the denominator HMM. The + numerator part of this is very fast. The denominator forward-backward takes + quite a lot of time, because there can be a large number of arcs in the + denominator FST (e.g. 200,000 arcs and 30,000 states in a typical Switchboard setup). + The time taken can be almost as much as the time taken in the neural-net + parts of the computation. We were quite careful to ensure memory locality. + + The next step to further speed this up is probably to implement a pruned + version of the forward-backward computation (like pruned Viterbi, but + computing posteriors). In order to get a speedup we'd have to prune away a + very high percentage of states, because we'd need to make up for the loss of + memory locality that pruning would bring. In our current implementation we are + careful to ensure that a group of GPU threads are all processing the same + HMM-state and time, just from different chunks (we call these different + 'sequences' in the code); and we make sure that the memory locations + corresponding to these different sequences are all next to each other in + memory, so the GPU can do coalesced memory access. With state-level + pruning, since the memory access for the different sequences would no longer be + 'in sync', we would lose this advantage. It should still be doable to get a + pruned version of the forward-backward algorithm, though. + + For speed, we don't use log values in the alpha-beta computation for the + denominator graph. In order to keep all the numerical values in a suitable + range, we multiply all the acoustic probabilities (exponentiated nnet outputs) + on each frame by an 'arbitrary value' selected to ensure that our alpha scores + stay in a good range. We call this an 'arbitrary value' because the algorithm + is designed so that we could choose any value here, and it would still be + mathematically correct. We designate one HMM state as a 'special state', and + the 'arbitrary constant' is chosen to be the inverse of that special state's alpha + on the previous frame. This keeps the special state's alpha values close to + one. As the 'special state' we choose a state that has high probability in the + limiting distribution of the HMM, and which can access the majority of states + of the HMM. + + \section chain_decoding Decoding with 'chain' models + + The decoding process with 'chain' models is exactly the same as for regular nnet3 + neural-net based models, and in fact uses the same script (steps/nnet3/decode.sh). + There are a few configuration differences: + + - Firstly, the graph is built with a different and simpler topology; but this requires + no special action by the user, as the graph-building script anyway takes the + topology from the 'final.mdl' produced by the 'chain' training script, which + contains the correct topology. + + - By default when we compile the graph, we use a 'self-loop-scale' of 0.1. + This affects how the transition probabilities on self-loops are treated + (it generally works better). However, for the 'chain' models, because of + how they were trained, we need to use exactly the same + transition-probability scaling we trained with, which for simplicity we + have set to 1.0. So we supply the option --self-loop-scale + 1.0 to the utils/mkgraph.sh script. + + - There is no 'division by the prior' necessary in these models.
So we simply + don't set the vector of priors in the .mdl files; we made sure + that the decoder just omits the division by the prior if the priors are not set. + + - The default acoustic scale we typically use in decoding (0.1) is not + suitable-- for 'chain' models the optimal acoustic scale is very close to 1. + So we supply the option --acwt 1.0 to the script + steps/nnet3/decode.sh. + + - The scoring scripts can only search the language-model scale in increments + of 1, which works well in typical setups where the optimal language-model scale + is between 10 and 15, but not when the optimal language-model scale is close + to 1 as it is here. (Note: for current purposes you can treat the language-model + scale as the same as the inverse of the acoustic scale). In order to + work around this issue without changing the scoring scripts (which are + database-specific), we supply a new option --post-decode-acwt 10.0 + to the script steps/nnet3/decode.sh, + which scales the acoustic probabilities by 10 before dumping the lattice. + After this, the optimal language-model scale will be around 10, which might + be a little confusing if you are not aware of this issue, but is convenient + for the way the scoring scripts are set up. + + - The default decoding and lattice beams are suitable without modification + for the 'chain' models, once you use the --acwt 1.0 option. + However, they won't show the full possible speedup, and you can get faster + decoding by using slightly tighter beams. By tightening the beam in the + Switchboard setup we were able to get decoding time down from around 1.5 + times real time to around 0.5 times real time, with only around 0.2\% + degradation in accuracy (this was with neural net evaluation on the CPU; on + the GPU it would have been even faster). Note from Dan: this is all to the best + of my recollection as I write this; actually the degradation may have been more than + that. And bear in mind that this was on high-powered modern server machines + (single-threaded). + + You might notice in the current example scripts that we use iVectors. We do so + just because they generally help a bit, and because the baseline setup we were + comparing with uses them. There is no inherent connection with 'chain' + models, and no fundamental requirement to use them. Actually we want to get rid + of them (see below). + + + \section chain_next_steps Next steps (TODOs) with 'chain' models + + (Note: this list is valid as of Dec 13 2015, but may become out of date). + Things we need to do (and that we'd like help with) are: + - Supply example scripts (and tune them) on a wide range of corpora. + (It will be interesting to see whether there are scale-dependent effects + affecting how well this model works). + - Create and tune LSTM and BLSTM versions of the training script. (This + may involve some playing around with learning rate schedules and + configurations). + - Figure out how to speed up the forward-backward part of the computation. + (E.g. using state-level pruning, or just by optimizing the current kernels or + data structures). + + A longer-term TODO, which Dan should do, is to create an online decoding setup + for these models. Actually this isn't really distinct from nnet3 online + decoding in general, since the models are no different from regular nnet3 + acoustic models. But we do have to decide whether to continue to support + iVectors-- getting rid of them would simplify the setup considerably, and + would hopefully make it more robust.
We are hoping that with LSTMs, since it + already sees quite a wide acoustic context, iVector adaptation will no longer + be as helpful and could be dropped. We also have other ideas how to + incorporate adaptation as part of the neural network, without the use of + iVectors. This will require some experimentation. + + +*/ + +} diff --git a/src/doc/cpplint.py b/src/doc/cpplint.py index 837620b0b68..03d0569ab1c 100755 --- a/src/doc/cpplint.py +++ b/src/doc/cpplint.py @@ -2567,8 +2567,8 @@ def CheckLanguage(filename, clean_lines, linenum, file_extension, include_state, error(filename, linenum, 'runtime/memset', 4, 'Did you mean "memset(%s, 0, %s)"?' % (match.group(1), match.group(2))) - - if Search(r'\busing namespace\b', line): + match = Search(r'\busing namespace kaldi\b',line) + if not match and Search(r'\busing namespace\b', line): error(filename, linenum, 'build/namespaces', 5, 'Do not use namespace using-directives. ' 'Use using-declarations instead.') diff --git a/src/doc/data_prep.dox b/src/doc/data_prep.dox index ecf0ecc67b9..9db285b340b 100644 --- a/src/doc/data_prep.dox +++ b/src/doc/data_prep.dox @@ -25,11 +25,11 @@ After running the example scripts (see \ref tutorial), you may want to set up Kaldi to run with your own data. This section explains how to prepare the data. This page will assume that you are using the latest version of the example scripts - (typically named "s5" in the example directories, e.g. egs/rm/s5/). + (typically named "s5" in the example directories, e.g. egs/rm/s5/). In addition to this page, you can refer to the data preparation scripts in those directories. The top-level run.sh scripts (e.g. egs/rm/s5/run.sh) have a few commands at the top of them that relate to various phases of data preparation. The parts in - the sub-directory named local/ are always specific to the database. For example, + the sub-directory named local/ are always specific to the database. For example, in the Resource Management (RM) setup it is local/rm_data_prep.sh. In the case of RM these commands are: \verbatim @@ -85,7 +85,7 @@ cmvn.scp feats.scp reco2file_and_channel segments spk2utt text utt2spk wa Not all of the files are equally important. For a simple setup where there is no "segmentation" information (i.e. each utterance corresponds to a single file), the only files you have to create yourself are "utt2spk", "text" and "wav.scp" and possibly -"segments" and "reco2file_and_channel", and the rest will be created by standard scripts. +"segments" and "reco2file_and_channel", and the rest will be created by standard scripts. We will describe the files in this directory, starting with the files you need to create yourself. @@ -95,7 +95,7 @@ yourself. The file "text" contains the transcriptions of each utterance. \verbatim s5# head -3 data/train/text -sw02001-A_000098-001156 HI UM YEAH I'D LIKE TO TALK ABOUT HOW YOU DRESS FOR WORK AND +sw02001-A_000098-001156 HI UM YEAH I'D LIKE TO TALK ABOUT HOW YOU DRESS FOR WORK AND sw02001-A_001980-002131 UM-HUM sw02001-A_002736-002893 AND IS \endverbatim @@ -104,12 +104,12 @@ but if you have speaker information in your setup, you should make the speaker-i prefix of the utterance id; this is important for reasons relating to the sorting of these files. The rest of the line is the transcription of each sentence. You don't have to make sure that all words in this file are in your vocabulary; out of vocabulary words will -get mapped to a word specified in the file data/lang/oov.txt. +get mapped to a word specified in the file data/lang/oov.txt. 
Note: although, in this particular example we have used an underscore to separate the "speaker" and "utterance" parts of the utterance-id, in general it is probably safer to use a dash ("-"). This is because it has a lower ASCII value; it has been pointed out -to me that if an underscore is used, and if the speaker-ids vary in length, in certain -cases the speaker-ids and their corresponding utterance ids can end up being sorted in +to me that if an underscore is used, and if the speaker-ids vary in length, in certain +cases the speaker-ids and their corresponding utterance ids can end up being sorted in different orders when using the standard "C"-style ordering on strings. \endverbatim Another important file is wav.scp. In the Switchboard example, @@ -118,7 +118,7 @@ s5# head -3 data/train/wav.scp sw02001-A /home/dpovey/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 1 /export/corpora3/LDC/LDC97S62/swb1/sw02001.sph | sw02001-B /home/dpovey/kaldi-trunk/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 2 /export/corpora3/LDC/LDC97S62/swb1/sw02001.sph | \endverbatim -The format of this file is +The format of this file is \verbatim \endverbatim @@ -135,7 +135,7 @@ sw02001-A_000098-001156 sw02001-A 0.98 11.56 sw02001-A_001980-002131 sw02001-A 19.8 21.31 sw02001-A_002736-002893 sw02001-A 27.36 28.93 \endverbatim -The format of the "segments" file is: +The format of the "segments" file is: \verbatim \endverbatim @@ -146,7 +146,7 @@ an arbitrary identifier that you can choose. The file "reco2file_and_channel" is only used when scoring (measuring error rates) with NIST's "sclite" tool: \verbatim -s5# head -3 data/train/reco2file_and_channel +s5# head -3 data/train/reco2file_and_channel sw02001-A sw02001 A sw02001-B sw02001 B sw02005-A sw02005 A @@ -156,7 +156,7 @@ The format is: \endverbatim The filename is typically the name of the .sph file, without the suffix, but in -general it's whatever identifier you have in your "stm" file. +general it's whatever identifier you have in your "stm" file. The recording side is a concept that relates to telephone conversations where there are two channels, and if not, it's probably safe to use "A". If you don't have an "stm" file or you have no idea what this is all about, then you don't need @@ -202,8 +202,8 @@ All of these files should be sorted. If they are not sorted, you will get error when you run the scripts. In \ref io_sec_tables we explain why this is needed. It has to do with the I/O framework; the ultimate reason for the sorting is to enable something equivalent to random-access lookup on a stream that doesn't support -fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes -from other Kaldi commands, reading different types of object, and are doing something +fseek(), such as a piped command. Many Kaldi programs are reading multiple pipes +from other Kaldi commands, reading different types of object, and are doing something roughly comparable to merge-sort on the different inputs; merge-sort, of course, requires that the inputs be sorted. Be careful when you sort that you have the shell variable LC_ALL defined as "C", @@ -249,37 +249,37 @@ that is what we use in this particular script. The format is: \verbatim \endverbatim -Each of the feature files contains a matrix, in Kaldi format. -In this case the dimension of the matrix would be (the length of the file in 10ms intervals) by 13. +Each of the feature files contains a matrix, in Kaldi format. 
+In this case the dimension of the matrix would be (the length of the file in 10ms intervals) by 13. The "extended filename" /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/raw_mfcc_train.1.ark:24 means, open the "archive" file /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/raw_mfcc_train.1.ark, fseek() -to position 24, and read the data that's there. +to position 24, and read the data that's there. This feats.scp file is created by the command \verbatim -steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir +steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir \endverbatim which is invoked by the top-level "run.sh" script. For the definitions of the shell variables, see that script. \$mfccdir is a user-specified directory where the -.ark files will be written. +.ark files will be written. The last file in the directory data/train is "cmvn.scp". This contains statistics for cepstral mean and variance normalization, indexed by speaker. Each set of statistics is a matrix, of dimension 2 by 14 in this case. In our example, we have: \verbatim -s5# head -3 data/train/cmvn.scp +s5# head -3 data/train/cmvn.scp 2001-A /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:7 2001-B /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:253 2005-A /home/dpovey/kaldi-trunk/egs/swbd/s5/mfcc/cmvn_train.ark:499 \endverbatim Unlike feats.scp, this scp file is indexed by speaker-id, not utterance-id. -This file is created by a command such as this: +This file is created by a command such as this: \verbatim -steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir +steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir \endverbatim (this example is from egs/swbd/s5/run.sh). -Because errors in data preparation can cause problems later on, we have a script to +Because errors in data preparation can cause problems later on, we have a script to check that the data directory is correctly formatted. Run e.g. \verbatim utils/validate_data_dir.sh data/train @@ -307,7 +307,7 @@ s5# ls data/lang_test G.fst L.fst L_disambig.fst oov.int oov.txt phones phones.txt topo words.txt \endverbatim Note that lang_test/ was created by copying lang/ and adding G.fst. -Each of these directories seems to contain only a few files. +Each of these directories seems to contain only a few files. It's not quite as simple as this though, because "phones" is a directory: \verbatim s5# ls data/lang/phones @@ -347,9 +347,9 @@ utils/int2sym.pl and utils/sym2int.pl, and by the OpenFst programs fstcompile an fstprint. The file L.fst is the Finite State Transducer form of the lexicon (L, -see "Speech Recognition -with Weighted Finite-State Transducers" by Mohri, Pereira and -Riley, in Springer Handbook on SpeechProcessing and Speech Communication, 2008). +see "Speech Recognition +with Weighted Finite-State Transducers" by Mohri, Pereira and +Riley, in Springer Handbook on SpeechProcessing and Speech Communication, 2008). with phone symbols on the input and word symbols on the output. The file L_disambig.fst is the lexicon, as above but including the disambiguation symbols \#1, \#2, and so on, as well as the self-loop with \#0 on it to "pass through" @@ -368,7 +368,7 @@ containing just a phone that we designate as a "garbage phone"; this phone will align with various kinds of spoken noise. 
In our particular setup, this phone is called \ (short for "spoken noise"): \verbatim -s5# grep -w UNK data/local/dict/lexicon.txt +s5# grep -w UNK data/local/dict/lexicon.txt SPN \endverbatim The file oov.int contains the integer form of this (extracted from words.txt), @@ -404,20 +404,20 @@ s5# cat data/lang/topo \endverbatim This specifies the topology of the HMMs we use. In this case, the "real" phones contain -three emitting states +three emitting states with the standard 3-state left-to-right topology-- the "Bakis model". (Emitting states are states that "emit" feature vectors, as distinct from the "fake" non-emitting states that are just used to glue other states together). Phones 1 to 20 are various kinds of silence and noise; we have a lot because of word-position-dependency, and in fact most of these will never be used; the real number excluding word position -dependency is more like five. The "silence phones" have a more complex topology with an +dependency is more like five. The "silence phones" have a more complex topology with an initial emitting state and an end emitting state, but then three emitting states in the middle. You don't have to create this file by hand. There are a number of files in data/lang/phones/ that specify various things about the phone set. Most of these files exist in three separate versions: a ".txt" form, e.g.: \verbatim -s5# head -3 data/lang/phones/context_indep.txt +s5# head -3 data/lang/phones/context_indep.txt SIL SIL_B SIL_E @@ -432,7 +432,7 @@ s5# head -3 data/lang/phones/context_indep.int and a ".csl" form, which in a slight abuse of notation, denotes a colon-separated list, not a comma-separated list: \verbatim -s5# cat data/lang/phones/context_indep.csl +s5# cat data/lang/phones/context_indep.csl 1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20 \endverbatim These files always contain the same information, so let's focus on the ".txt" form which @@ -474,7 +474,7 @@ lexicon (not part of a word), SIL_B would be a silence phone at the b (which should never exist), SIL_I word-internal silence (unlikely to exist), SIL_E word-ending silence (should never exist), and SIL_S would be silence as a "singleton word", i.e. a phone with only one word-- this might be used if you had a "silence word" in your -lexicon and explicit silences appear in your transcriptions. +lexicon and explicit silences appear in your transcriptions. The files silence.txt and nonsilence.txt contains lists of the silence phones and nonsilence phones respectively. These should be mutually exclusive and together, @@ -489,11 +489,11 @@ to designate all silence, noise and vocalized-noise phones as "silence" phones, phones representing traditional phonemes as "nonsilence" phones. We haven't experimented in Kaldi with the best way to do this. \verbatim -s5# head -3 data/lang/phones/silence.txt +s5# head -3 data/lang/phones/silence.txt SIL SIL_B SIL_E -s5# head -3 data/lang/phones/nonsilence.txt +s5# head -3 data/lang/phones/nonsilence.txt IY_B IY_E IY_I @@ -502,7 +502,7 @@ IY_I The file disambig.txt contains a list of the "disambiguation symbols" (see \ref graph_disambig): \verbatim -s5# head -3 data/lang/phones/disambig.txt +s5# head -3 data/lang/phones/disambig.txt #0 #1 #2 @@ -512,7 +512,7 @@ These symbols appear in the file phones.txt as if they were phones. 
The file optional_silence.txt contains a single phone which can optionally appear between words: \verbatim -s5# cat data/lang/phones/optional_silence.txt +s5# cat data/lang/phones/optional_silence.txt SIL \endverbatim The mechanism by which it appears optionally between words is that it appears @@ -527,7 +527,7 @@ rather than linguistically meaningful ones). In this particular setup, sets.txt groups together all the word-position-dependent versions of each phone: \verbatim -s5# head -3 data/lang/phones/sets.txt +s5# head -3 data/lang/phones/sets.txt SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S @@ -536,27 +536,27 @@ NSN NSN_B NSN_E NSN_I NSN_S The file extra_questions.txt contains some extra questions which we'll include in addition to the automatically generated questions: \verbatim -s5# cat data/lang/phones/extra_questions.txt -IY_B B_B D_B F_B G_B K_B SH_B L_B M_B N_B OW_B AA_B TH_B P_B OY_B R_B UH_B AE_B S_B T_B AH_B V_B W_B Y_B Z_B CH_B AO_B DH_B UW_B ZH_B EH_B AW_B AX_B EL_B AY_B EN_B HH_B ER_B IH_B JH_B EY_B NG_B -IY_E B_E D_E F_E G_E K_E SH_E L_E M_E N_E OW_E AA_E TH_E P_E OY_E R_E UH_E AE_E S_E T_E AH_E V_E W_E Y_E Z_E CH_E AO_E DH_E UW_E ZH_E EH_E AW_E AX_E EL_E AY_E EN_E HH_E ER_E IH_E JH_E EY_E NG_E -IY_I B_I D_I F_I G_I K_I SH_I L_I M_I N_I OW_I AA_I TH_I P_I OY_I R_I UH_I AE_I S_I T_I AH_I V_I W_I Y_I Z_I CH_I AO_I DH_I UW_I ZH_I EH_I AW_I AX_I EL_I AY_I EN_I HH_I ER_I IH_I JH_I EY_I NG_I -IY_S B_S D_S F_S G_S K_S SH_S L_S M_S N_S OW_S AA_S TH_S P_S OY_S R_S UH_S AE_S S_S T_S AH_S V_S W_S Y_S Z_S CH_S AO_S DH_S UW_S ZH_S EH_S AW_S AX_S EL_S AY_S EN_S HH_S ER_S IH_S JH_S EY_S NG_S -SIL SPN NSN LAU -SIL_B SPN_B NSN_B LAU_B -SIL_E SPN_E NSN_E LAU_E -SIL_I SPN_I NSN_I LAU_I -SIL_S SPN_S NSN_S LAU_S +s5# cat data/lang/phones/extra_questions.txt +IY_B B_B D_B F_B G_B K_B SH_B L_B M_B N_B OW_B AA_B TH_B P_B OY_B R_B UH_B AE_B S_B T_B AH_B V_B W_B Y_B Z_B CH_B AO_B DH_B UW_B ZH_B EH_B AW_B AX_B EL_B AY_B EN_B HH_B ER_B IH_B JH_B EY_B NG_B +IY_E B_E D_E F_E G_E K_E SH_E L_E M_E N_E OW_E AA_E TH_E P_E OY_E R_E UH_E AE_E S_E T_E AH_E V_E W_E Y_E Z_E CH_E AO_E DH_E UW_E ZH_E EH_E AW_E AX_E EL_E AY_E EN_E HH_E ER_E IH_E JH_E EY_E NG_E +IY_I B_I D_I F_I G_I K_I SH_I L_I M_I N_I OW_I AA_I TH_I P_I OY_I R_I UH_I AE_I S_I T_I AH_I V_I W_I Y_I Z_I CH_I AO_I DH_I UW_I ZH_I EH_I AW_I AX_I EL_I AY_I EN_I HH_I ER_I IH_I JH_I EY_I NG_I +IY_S B_S D_S F_S G_S K_S SH_S L_S M_S N_S OW_S AA_S TH_S P_S OY_S R_S UH_S AE_S S_S T_S AH_S V_S W_S Y_S Z_S CH_S AO_S DH_S UW_S ZH_S EH_S AW_S AX_S EL_S AY_S EN_S HH_S ER_S IH_S JH_S EY_S NG_S +SIL SPN NSN LAU +SIL_B SPN_B NSN_B LAU_B +SIL_E SPN_E NSN_E LAU_E +SIL_I SPN_I NSN_I LAU_I +SIL_S SPN_S NSN_S LAU_S \endverbatim You will observe that a question is simply a set of phones. The first four questions are asking about the word-position, for regular phones; and the last five do the same for the "silence phones". The "silence" phones also come in a variety without a suffix like _B, for example SIL. These may appear as optional silence in the lexicon, i.e. not inside an actual word. In setups with things like tone dependency or stress markings, extra_questions.txt -may contain questions that relate to those features. +may contain questions that relate to those features. 
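+Just to make the structure of these questions concrete, here is a small Python
+sketch (illustrative only; this is not what utils/prepare_lang.sh actually runs)
+that builds word-position questions of the kind shown above from a phone list:
+\verbatim
+# Sketch: generate word-position questions like those in extra_questions.txt.
+# The phone lists below are truncated examples, not a real phone set.
+real_phones = ["IY", "B", "D"]
+silence_phones = ["SIL", "SPN", "NSN", "LAU"]
+
+questions = []
+for suffix in ["_B", "_E", "_I", "_S"]:      # position-dependent real phones
+    questions.append([p + suffix for p in real_phones])
+questions.append(list(silence_phones))       # un-suffixed silence phones
+for suffix in ["_B", "_E", "_I", "_S"]:      # position-dependent silences
+    questions.append([p + suffix for p in silence_phones])
+
+for q in questions:                          # one question (phone set) per line
+    print(" ".join(q))
+\endverbatim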
The file word_boundary.txt explains how the phones relate to word positions: \verbatim -s5# head data/lang/phones/word_boundary.txt +s5# head data/lang/phones/word_boundary.txt SIL nonword SIL_B begin SIL_E end @@ -570,14 +570,14 @@ we don't like to hardcode this in the text form of the phones-- for one thing, K never see the text form of the phones, but only an integerized form. So it is specified by this file word_boundary.txt. The main reason we need this information is in order to recover the word boundaries within lattices (for example, the program -lattice-align-words reads the integer versin of this file, word_boundaray.int). +lattice-align-words reads the integer versin of this file, word_boundaray.int). Finding the word boundaries is useful for reasons including NIST sclite scoring, which requires the time markings for words, and for other downstream processing. The file roots.txt contains information that relates to how we build the phonetic-context decision tree: \verbatim -head data/lang/phones/roots.txt +head data/lang/phones/roots.txt shared split SIL SIL_B SIL_E SIL_I SIL_S shared split SPN SPN_B SPN_E SPN_I SPN_S shared split NSN NSN_B NSN_E NSN_I NSN_S @@ -607,7 +607,7 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang \endverbatim Here, the inputs are the directory data/local/dict/, and the label \ which is the dictionary word we will map OOV words to when appear in transcripts -(this becomes data/lang/oov.txt). The location data/local/lang/ is simply a +(this becomes data/lang/oov.txt). The location data/local/lang/ is simply a temporary directory which the script will use; data/lang/ is where it actually puts its output. @@ -617,21 +617,21 @@ The thing which you, as the data-preparer, need to create, is the directory s5# ls data/local/dict extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt \endverbatim -(in fact there are a few more files there which we haven't listed, but they are just temporary files that +(in fact there are a few more files there which we haven't listed, but they are just temporary files that were put there while creating that directory, and we can ignore them). The commands below give you an idea what is in these files: \verbatim -s5# head -3 data/local/dict/nonsilence_phones.txt +s5# head -3 data/local/dict/nonsilence_phones.txt IY B D -s5# cat data/local/dict/silence_phones.txt +s5# cat data/local/dict/silence_phones.txt SIL SPN NSN LAU -s5# cat data/local/dict/extra_questions.txt -s5# head -5 data/local/dict/lexicon.txt +s5# cat data/local/dict/extra_questions.txt +s5# head -5 data/local/dict/lexicon.txt !SIL SIL -'S S -'S Z @@ -650,7 +650,7 @@ on separate lines, if we have multiple pronunciations for it. If you want to use pronunciation probabilities, instead of creating the file lexicon.txt, create a file called lexiconp.txt that has the probability as the second field. -Note that it is a common practice to normalize the pronunciations probabilities so that +Note that it is a common practice to normalize the pronunciations probabilities so that instead of summing to one, the most probable pronunciation for each word is one. This tends to give better results. For a top-level script that runs with pronunciation probabilities, search for pp in egs/wsj/s5/run.sh. @@ -666,35 +666,35 @@ versions of a particular phone that have different stress or tone. In order to demonstrate what this looks like, we'll view the same files as above, but in the egs/wsj/s5/ setup. 
The result is below: \verbatim -s5# cat data/local/dict/silence_phones.txt +s5# cat data/local/dict/silence_phones.txt SIL SPN NSN -s5# head data/local/dict/nonsilence_phones.txt -S -UW UW0 UW1 UW2 -T -N -K -Y -Z -AO AO0 AO1 AO2 -AY AY0 AY1 AY2 -SH -s5# head -6 data/local/dict/lexicon.txt +s5# head data/local/dict/nonsilence_phones.txt +S +UW UW0 UW1 UW2 +T +N +K +Y +Z +AO AO0 AO1 AO2 +AY AY0 AY1 AY2 +SH +s5# head -6 data/local/dict/lexicon.txt !SIL SIL SPN SPN NSN !EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T "CLOSE-QUOTE K L OW1 Z K W OW1 T -s5# cat data/local/dict/extra_questions.txt -SIL SPN NSN -S UW T N K Y Z AO AY SH W NG EY B CH OY JH D ZH G UH F V ER AA IH M DH L AH P OW AW HH AE R TH IY EH -UW1 AO1 AY1 EY1 OY1 UH1 ER1 AA1 IH1 AH1 OW1 AW1 AE1 IY1 EH1 -UW0 AO0 AY0 EY0 OY0 UH0 ER0 AA0 IH0 AH0 OW0 AW0 AE0 IY0 EH0 -UW2 AO2 AY2 EY2 OY2 UH2 ER2 AA2 IH2 AH2 OW2 AW2 AE2 IY2 EH2 -s5# +s5# cat data/local/dict/extra_questions.txt +SIL SPN NSN +S UW T N K Y Z AO AY SH W NG EY B CH OY JH D ZH G UH F V ER AA IH M DH L AH P OW AW HH AE R TH IY EH +UW1 AO1 AY1 EY1 OY1 UH1 ER1 AA1 IH1 AH1 OW1 AW1 AE1 IY1 EH1 +UW0 AO0 AY0 EY0 OY0 UH0 ER0 AA0 IH0 AH0 OW0 AW0 AE0 IY0 EH0 +UW2 AO2 AY2 EY2 OY2 UH2 ER2 AA2 IH2 AH2 OW2 AW2 AE2 IY2 EH2 +s5# \endverbatim You may notice that some of the lines in nonsilence_phones.txt contain multiple phones on a single line. These are the different stress-dependent @@ -722,8 +722,8 @@ of the stress-dependent versions of phones may have too little data to robustly estimate either a separate decision tree or the phone clustering information that's used in producing the questions. By grouping them together like this, we ensure that in the absence of enough data to estimate them -separately, these different versions of the phone all "stay together" throughout -the decision-tree building process. +separately, these different versions of the phone all "stay together" throughout +the decision-tree building process. We should mention at this point that the script utils/prepare_lang.sh supports a number of options. To give you an idea of what they are, here is @@ -731,7 +731,7 @@ the usage messages of that script: \verbatim usage: utils/prepare_lang.sh e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang -options: +options: --num-sil-states # default: 5, #states in silence models. --num-nonsil-states # default: 3, #states in non-silence models. --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I @@ -778,7 +778,7 @@ local/make_rm_lm.pl $RMROOT/rm1_audio1/rm1/doc/wp_gram.txt > $tmpdir/G.txt || e This script local/make_rm_lm.pl creates a grammar in FST format (text format, not binary format). It contains lines like the following: \verbatim -s5# head data/local/tmp/G.txt +s5# head data/local/tmp/G.txt 0 1 ADD ADD 5.19849703126583 0 2 AJAX+S AJAX+S 5.19849703126583 0 3 APALACHICOLA+S APALACHICOLA+S 5.19849703126583 @@ -788,7 +788,7 @@ have a useful tutorial). The script local/rm_prepare_grammar.sh will the binary-format file G.fst using the following statement: \verbatim fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \ - --keep_osymbols=false $tmpdir/G.txt > data/lang/G.fst + --keep_osymbols=false $tmpdir/G.txt > data/lang/G.fst \endverbatim If you want to create your own grammar, you will probably want to do something similar. Note: this type of procedure only applies to grammars of a certain class: it won't @@ -797,7 +797,13 @@ in OpenFst format. 
There are ways to do this in the WFST framework (e.g. see recent work by Mike Riley with push down transducers), but we have not yet worked with those ideas in Kaldi. -In the WSJ setup, we use a statistical language model. The script local/wsj_format_data.sh +Please, before asking any questions on the list about language models or about making +grammar FSTs, read "A Bit of Progress in Language Modeling" by Joshua Goodman; and go to +www.openfst.org and do the FST tutorial so that you understand the basics of finite +state transducers. (Note that language models would be represented as finite state +acceptors, or FSAs, which can be considered a special case of finite state transducers). + +In the WSJ setup (like most setups), we use a statistical language model. The script local/wsj_format_data.sh deals with converting the ARPA-format language models supplied with the WSJ database, into an OpenFst format. Some of the key commands from that script are: \verbatim @@ -832,5 +838,9 @@ FST from text form into OpenFst binary form; and fstrmepsilon is also an OpenFst command, which removes the small number of \ symbols from the FST, which were converted from \ and \. +A popular toolkit for building language models is SRILM. Various language +modeling toolkits are used in the Kaldi example scripts. SRILM is the best +documented and most fully featured, and we generally recommend it (its only +drawback is that it doesn't have the most free licence). */ diff --git a/src/doc/dnn.dox b/src/doc/dnn.dox index 25b6e8e19cf..5b3d2b98261 100644 --- a/src/doc/dnn.dox +++ b/src/doc/dnn.dox @@ -76,6 +76,7 @@ namespace kaldi { - Documentation for Karel's version is available at \subpage dnn1 - Documentation for Dan's old version is available at \subpage dnn2. - Documentation for the nnet3 setup is available at \subpage dnn3. + - Documentation for the 'nnet3+chain' setup is available at \subpage chain. */ diff --git a/src/doc/dnn3_code_data_types.dox b/src/doc/dnn3_code_data_types.dox index f72721e1715..30623e6c658 100644 --- a/src/doc/dnn3_code_data_types.dox +++ b/src/doc/dnn3_code_data_types.dox @@ -26,7 +26,7 @@ namespace nnet3 { - Up: \ref dnn3 - Next: \ref dnn3_code_compilation - + \section dnn3_dt_problem Objectives and background The previous \ref dnn1 "nnet1" and \ref dnn2 "nnet2" setups are based on a Component @@ -89,7 +89,7 @@ output-node name=output input=output_nonlin with a number of additional indexes: time (t), an index (n) that indicates the example within the minibatch (e.g. 0 through 511 for a 512-example minibatch), plus an "extra" index (x) that may eventually be - useful in convolutional approaches but is usually zero for now. + useful in convolutional approaches but is usually zero for now. To formalize the above, we define an Index as a tuple (n, t, x). We will also define a \ref Cindex as a tuple (node-index, Index), where the node-index is @@ -108,7 +108,7 @@ output-node name=output input=output_nonlin for receiving matrix-valued input, evaluating the NnetComputation, and supplying matrix-valued output. Think of this as the run-time of a very limited interpreted language. - + \section dnn3_dt_data_structures Basic data structures in nnet3 \subsection dnn3_dt_datastruct_index Indexes @@ -149,13 +149,13 @@ output-node name=output input=output_nonlin \verbatim [ (0, -1:1) (1, -1:1) ... ] \endverbatim - - + + \subsection dnn3_dt_datastruct_cindex Cindexes A \ref Cindex is a pair (int32, Index), where the int32 corresponds to the index of a node in a neural network.
As mentioned above, a \ref Nnet "neural network" consists of a collection of - named Components and a kind of graph on "nodes", and the nodes have indexes. + named Components and a kind of graph on "nodes", and the nodes have indexes. Cindexes are used during the compilation process, and they correspond to the nodes of a "computation graph" corresponding to a specific neural net computation. There is a correspondence @@ -217,8 +217,8 @@ output-node name=output input=output_nonlin Computation acts on are a list of matrices, and also submatrices that may occupy row or column ranges of a matrix. A Computation also contains various sets of indexes (arrays of integers and so on) that are sometimes required as - arguments to particular matrix operations. - + arguments to particular matrix operations. + We will describe this in more detail below in \ref dnn3_dt_nnet_computation. \subsection dnn3_dt_data_struct_computer NnetComputer @@ -229,7 +229,7 @@ output-node name=output input=output_nonlin the NnetComputation. - \section dnn3_dt_nnet Neural networks in nnet3 + \section dnn3_dt_nnet Neural networks in nnet3 The previous section should have given you a high-level overview of how the framework fits together. In this section we will go into a little more detail @@ -251,7 +251,7 @@ class Component { virtual void Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in_value, - const CuMatrixBase &out_value, + const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, Component *to_update, // may be NULL; may be identical // to "this" or different. @@ -266,7 +266,7 @@ class Component { to create the corresponding row of the output. In terms of Indexes, this means that the Indexes corresponding to each element of input and output are the same. Similar logic holds in the Backprop function. - + \subsection dnn3_dt_nnet_component_properties Components (properties) @@ -295,7 +295,7 @@ class Component { standard methods (RNNs, LSTMs and so on). Unlike in the \ref dnn2 "nnet2" framework, Components are not responsible for implementing things like splicing across frames; instead we use \ref dnn3_dt_nnet_descriptor_code "Descriptors" to handle that, as will be explained below. - + \subsection dnn3_dt_nnet_node_outline Neural network nodes (outline) @@ -349,7 +349,13 @@ Each component-node in the config file gets expanded to two nodes: a node of type kComponent, and an immediately preceding node of type kDescriptor that is defined by the "input" field. - +The config file above doesn't give an example of a dim-range node. The basic format +of a dim-range node is this (this example would take the first 50 dimensions from the 65 dimensions +of component affine1): +\verbatim +dim-range-node name=dim-range-node1 input-node=affine1_node dim-offset=0 dim=50 +\endverbatim + \subsection dnn3_dt_nnet_descriptor_code Descriptors in config files A Descriptor is a very limited type of expression that refers to quantities defined @@ -366,8 +372,8 @@ defined by the "input" field. \verbatim # caution, this is a simplification that overgenerates descriptors. ::= ;; node name of kInput or kComponent node. - ::= Append(, [, ... ] ) - ::= Sum(, ) + ::= Append(, [, ... ] ) + ::= Sum(, [, ...]) ;; Failover or IfDefined might be useful for time t=-1 in a RNN, for instance. ::= Failover(, ) ;; 1st arg if computable, else 2nd ::= IfDefined() ;; the arg if defined, else zero. @@ -386,10 +392,12 @@ defined by the "input" field. 
::= ReplaceIndex(, , ) \endverbatim - -Now we will describe the actual syntax, which differs from the above simplified -version because expressions may appear only in a certain hierarchy. This -syntax also corresponds more closely with the class names in the real code. +Now we will describe the actual syntax which the code uses internally, which +differs from the above simplified version because expressions may appear only in +a certain hierarchy. This syntax also corresponds more closely with the class +names in the real code. The code that reads Descriptors attempts to normalize +them in as general as possible a way, so that almost all of the above syntax +can be read and converted to the internal representation. \verbatim ;;; == class Descriptor ::= Append([, ... ] ) @@ -477,7 +485,7 @@ If the Descriptor is computable for this Index, the function will return true. For instance, the expression Sum(X, Y) would only be computable if X and Y are computable. If this function is going to return true, it will also append to "input_terms" only the input Cindexes that -actually appear in the evaluated expression. +actually appear in the evaluated expression. For example (and speaking loosely), in an expression of the form Failover(X, Y), if X is computable then only X would be appended to "input_terms", and not Y. @@ -491,7 +499,7 @@ and \ref Descriptor::IsComputable() "IsComputable()" with the same interface as SumDescriptor, and also functions such as \ref Descriptor::NumParts() "NumParts()" and \ref Descriptor::Part() "Part(int32 n)" that allow the user to access the individual SumDescriptors in its vector. - + \subsection dnn3_dt_nnet_node_detail Neural network nodes (detail) We will now describe neural network nodes in more detail. As mentioned above, @@ -502,9 +510,9 @@ enum NodeType { kInput, kDescriptor, kComponent, kDimRange }; The actual NetworkNode is a struct. To avoid the hassle of pointers and because C++ doesn't allow unions containing classes, we have a slightly messy layout: \verbatim -struct NetworkNode { +struct NetworkNode { NodeType node_type; - // "descriptor" is relevant only for nodes of type kDescriptor. + // "descriptor" is relevant only for nodes of type kDescriptor. Descriptor descriptor; union { // For kComponent, the index into Nnet::components_ @@ -537,7 +545,7 @@ public: ... private: std::vector component_names_; - std::vector components_; + std::vector components_; std::vector node_names_; std::vector nodes_; @@ -558,7 +566,7 @@ information necessary to interpret them. Internally it defines a number of type including the following enum value: \verbatim enum CommandType { - kAllocMatrixUndefined, kAllocMatrixZeroed, + kAllocMatrixUndefined, kAllocMatrixZeroed, kDeallocMatrix, kPropagate, kStoreStats, kBackprop, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, @@ -591,9 +599,9 @@ restricted row and column range of a matrix, like the matlab syntax }; struct SubMatrixInfo { int32 matrix_index; // index into "matrices": the underlying matrix. 
- int32 row_offset; + int32 row_offset; int32 num_rows; - int32 col_offset; + int32 col_offset; int32 num_cols; }; \endverbatim diff --git a/src/doc/examples.dox b/src/doc/examples.dox new file mode 100755 index 00000000000..7ba7a6043f3 --- /dev/null +++ b/src/doc/examples.dox @@ -0,0 +1,404 @@ +// doc/examples.dox + +// Copyright 2016 Fred Richardson Allen Guo + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + \page examples Examples included with Kaldi + + When you check out the Kaldi source tree (see \ref install), you will find many + sets of example scripts in the egs/ directory. This table summarizes some key + facts about some of those example scripts; however, it it not an exhaustive + list. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Name | BW | Lang | Train Domain | Train Hours | Train Speakers | License and Availability | Year Released | Speech Style | Test Domain | Kaldi Approx Perf | Model Type | LM Data | Lexicon
AMI 16k English
(+non-native)
Microphone: head-mike,
single and multiple
distance mikes
100 123 M
66 F
Free /
Download
http://groups.inf.ed.ac.uk/ami/corpus/
2014 Meeting room Same as train
no overlap(?)
~25% WER head (T)DNN
~45% WER distant (B)LSTM
AMI + (opt) Fisher 50K (CMU dict +
kaldi sources)
Aspire English Conversational microphone
developed on telephone
see Fisher 2015 30.8% WER (dev or eval?)
WSJ 16k English Clean close-mic
read speech
80 LDC
LDC93S6B (WSJ0) and LDC94S13B (WSJ1)
1993 Read speech Same 6-7% WER same as train 20k (CMU dict)
RM English read transcript
limited vocab and grammar
LDC
LDC93S3A
1987-1989 read speech same 1-2% WER predefined grammar <1K
RM dict
Timit 16k English read transcript
very limited grammar
630 1986 read speech same ~30-40% PER none ~47 phones
fisher_english 8k English Telephone speech
Auto-transcribed
(errorful transcriptions)
1,600 5203 M
7198 F
LDC
speech: LDC2004S13, LDC2005S13
transcript: LDC2004T19, LDC2005T19
2004/2005 CTS Fisher (may
overlap with
train)
~22% WER (DNN) LDC Fisher CMU dict
Size UNK
Switchboard 1 8k English CTS 300 LDC
Train: LDC97S62
Mississippi State transcriptions
Eval: LDC2002S09 and LDC2002T43
1993/1997/2000 CTS CTS
eval2000 (hub5)
~10% WER (LSTM) Mississippi Trans
+ (opt) Fisher
30K (CMU dict)
Switchboard 1
+ Fisher
8k English CTS see above see above see above see above CTS eval2000
rt03
~12% eval2000
~19% rt03
see above see above
Callhome
Egyptian
Egyptian
Colloquial
Arabic
CTS 120 conv LDC
Speech : LDC97S45
Transcripts : LDC97T19
Lexicon : LDC99L22
1997 CTS hub5 arabic
LDC2002S22
LDC2002T39
50-60% WER Train trans LDC dict
Corpus of
Spontaneous
Japanese
Japanese Mixed style
Close-talking mic
650 hours
(240 hr train)
>1,400 Unclear how to get this
http://www.ninjal.ac.jp/english/products/csj/
http://pj.ninjal.ac.jp/corpus_center/csj/
2004 Mixed 9-10% WER UNK UNK
Fisher Spanish
Callhome Spanish
Caribbean
Spanish
CTS Fisher: 163 hrs
Callhome: 60 hrs?
120 30min conv
Fisher: 136
Callhome:
LDC
Fisher speech : LDC96S35
Fisher transcripts : LDC96T17
Callhome Speech : LDC96S35
Callhome Transcripts : LDC96T17
Fisher: 2010
Callhome: 1996
CTS Kaldi subset
of Fisher
29-30% WER Fisher trans LDC96L16
Gale Arabic
Phase 2
16K Arabic Broadcast
Conversational/Report
320 train
9.3 test
LDC2013S02 LDC2014S07
LDC2013S07 LDC2014T17
LDC2013T17
LDC2013T04
Collected
2006/2007
Broadcast
Conversational
and Report
Report: 13% WER (LSTM)
Conver: 28% WER (LSTM)
Comb: 24% WER (LSTM)
LDC2013T17
LDC2013T04
LDC2014T17
http://alt.qcri.org/
Gale Mandarin 16K Mandarin
Chinese
Broadcast 126 LDC2013S08 LDC2013T20 2006-2007 Broadcast Same as train 17.5% WER [1] LDC2013S08
LDC2013T20
Same as HKUST below
hkust
EARS RT04F data
dev and train [2]
8K Mandarin
Chinese
Telephone Conversational ~145 ~873 LDC2005S15 LDC2005T32 2004 Conversational Same as train 33.5% CER Acoustic trans
(very little)
Both Eng and Man.
CMU dict use for Eng
mdbg dict use for Man
http://www.mdbg.net
librispeech [3] 16K English Read transcription 100 - 960
(460
F: 125-1128
M: 126-1167
http://www.openslr.org/12/ 2015 Read trans Librispeech
~5% Large (books) cmu (with sequitur
G2P)
reverb
sprakbanken Danish Read transcript? 350 Free download
http://www.nb.no/sprakbanken/#ticketsfrom?lang=en
2012 Read/Dictation Same as train 14% WER NST Provided NST Provided?
vystadial_en [4] 8Khz English Telephone, dialog system 41 unk Free 2014 Dialog sys Same as train ~11% WER (GMM/HMM) Train trans CMU + 250
vystadial_cz [4] 8Khz Czech Telephone, dialog system 15 unk Free 2014 Dialog sys Same as train ~50% WER (GMM/HMM) Train trans Rule derived
chime3 16Khz English Read trans, simulated
and real noise
18 WSJ0 + 4 Not clear (Chime performers) 2015 Read
transcript
Same as train
(same channels!)
~12% WER real (4 spkrs)
~12% WER simu
Official WSJ0 5K
trans
WSJ0
voxforge 16Khz English Read trans >75hrs unk Free GPL 2008? Read trans unk unk Train cmu + g2p for oov
Tedlium 16KHz English Presentation/talk 118 666 Free download 2014? Presentation Same as train ~10% WER Cantab provided LM Cantab provided dict
+ +[1] "Audio Augmentation for Speech Recognition" Tom Ko, Vijayaditya Peddinti, Daniel Povey, Sanjeev Khudanpur.
+[2] There should be more Mandarin data from rt04f - 50 hours of dev data I believe (see LDC2004E67, LDC2004E68). There should also be eval data. See https://www.ldc.upenn.edu/collaborations/past-projects/gale/data/gale-pubs.
+[3] See http://www.danielpovey.com/files/2015_icassp_librispeech.pdf for details. Acoustic and language models are available online.
+[4] See http://www.lrec-conf.org/proceedings/lrec2014/pdf/535_Paper.pdf. +*/ diff --git a/src/doc/glossary.dox b/src/doc/glossary.dox index ba42ea12370..31fa62d3389 100644 --- a/src/doc/glossary.dox +++ b/src/doc/glossary.dox @@ -26,7 +26,7 @@ search function of your browser. For convenience the definition of each term's section is preceded and followed by a colon, so for instance, typing ctrl-f ":lattice:" would take you to the section for "lattice". - +

@@ -37,7 +37,7 @@ synonymous with a sequence of transition-ids. Most of the time an alignment is derived from aligning the reference transcript of an utterance, in which case it is called a forced alignment. lattices also contain alignment information as sequences of transition-ids for each word -sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows +sequence in the lattice. The program \ref bin/show-alignments.cc "show-alignments" shows alignments in a human-readable format. :forced alignment: see alignment. @@ -54,6 +54,18 @@ of the HMMs, and also various other important integer mappings; see \ref transit This object is generally written at the start of model files. The program \ref bin/show-transitions.cc "show-transitions" shows these. +:G.fst: The grammar FST G.fst which lives in the + data/lang/ directory in the scripts (see \ref data_prep_lang) represents + the language model in a Finite State Transducer format (see www.openfst.org). + For the most part it is an acceptor, meaning the input and output symbols on the + arcs are the same, but for statistical language models with backoff, the backoff + arcs have the "disambiguation symbol" #0 on the input side only. + For many purposes you'll want to get rid of the disambiguation symbols + using the command fstproject --project_output=true. The disambiguation symbols + are needed during graph compilation to make the FST determinizable, but for things + like language-model rescoring you don't want them. + +
*/ diff --git a/src/doc/history.dox b/src/doc/history.dox index a3cb6d6fe27..bf114a3a9e0 100644 --- a/src/doc/history.dox +++ b/src/doc/history.dox @@ -55,13 +55,13 @@ and documentation); we were visited by Michael Riley (who helped us to understand OpenFst and gave some lectures on FSTs), and would like to acknowledge the help of Honza Cernocky (for allowing us to have the workshop and helping to organize it), - Renata Kohlova (administration), and Tomas Kasparek (system administration). + Renata Kohlova (administration), and Tomas Kasparek (system administration). It is possible that this list of contributors contains oversights; any important omissions are unlikely to be intentional. A lot of code was written during the summer of 2010 but we still did not have a complete working system. Some of the participants of the 2010 workshop - continued working to complete the toolkit and get a working set of training scripts. + continued working to complete the toolkit and get a working set of training scripts. The code was released on May 14th, 2011. Since the initial release, Kaldi has been maintained and developed to a large @@ -95,9 +95,15 @@ for his help in organizing the JHU'09 workshop and with the Wall Street Journal recipe. We would also like to acknowledge the help of faculty and staff at Johns Hopkins University's Center for Language and - Speech Processing during the JHU'09 workshop: particularly + Speech Processing during the JHU'09 workshop: particularly Sanjeev Khudanpur, Desiree Cleves and the late Fred Jelinek. + Since 2012, Kaldi development has received significant support from IARPA's + BABEL program (IARPA-BAA-11-02) and from the Human Language Technology + Center of Excellence (HLTCOE); and since 2015, from the NSF computing + research infrastructure (CRI) award ``CI-EN: Enhancements for the Kaldi Speech + Recognition Toolkit''. + Sanjeev Khudanpur deserves special mention for creating the conditions for the Kaldi project to succeed, first at the JHU'09 workshop where in his role as workshop organizer he was instrumental in putting the team together diff --git a/src/doc/hmm.dox b/src/doc/hmm.dox index 9935fa52711..938321fd7b2 100644 --- a/src/doc/hmm.dox +++ b/src/doc/hmm.dox @@ -447,9 +447,10 @@ We now explain what these three scales do: when we add the self-loop, let the probability mass given to the self-loop be p and the mass given to the rest be (1-p). We add a self-loop with log-probability self_loop_scale * log(p), and add (self_loop_scale * log(1-p)) to all the other - log transition probabilities - out of that state. In typical topologies, the self-loop scale is the only scale - that matters. + log transition probabilities out of that state. (Note: in the initial stage of + graph creation we create a graph without self-loops, and with the non-self-loop + transition probabilities renormalized to sum to one). In typical topologies, the + self-loop scale is the only scale that matters. 
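To make the self-loop scale concrete, here is a small worked example; the values p = 0.75 and self_loop_scale = 0.1 are assumptions chosen for illustration, not figures taken from this page:
\verbatim
# p = 0.75, self_loop_scale = 0.1
# self-loop log-probability       = 0.1 * log(0.75) ~= -0.029
# added to the other transitions  = 0.1 * log(0.25) ~= -0.139
\endverbatim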
The reason we feel it might make sense to apply a different probability scale to the self-loops versus the normal transition scale is we think they could be diff --git a/src/doc/install.dox b/src/doc/install.dox index 0ffb2b1220f..b40b139a8dc 100644 --- a/src/doc/install.dox +++ b/src/doc/install.dox @@ -29,8 +29,8 @@ possibly including unfinished and experimental features, can be downloaded by typing into a shell: \verbatim - git clone https://github.com/kaldi-asr/kaldi.git kaldi-trunk --origin golden - cd kaldi-trunk + git clone https://github.com/kaldi-asr/kaldi.git kaldi --origin upstream + cd kaldi \endverbatim If you want to get updates and bug fixes you can go to some checked-out directory, and type diff --git a/src/doc/mainpage.dox b/src/doc/mainpage.dox index 1665607b330..7bedc25ef13 100644 --- a/src/doc/mainpage.dox +++ b/src/doc/mainpage.dox @@ -45,6 +45,7 @@ - \subpage dependencies - \subpage legal - \subpage tutorial + - \subpage examples - \subpage glossary - \subpage data_prep - \subpage build_setup @@ -75,6 +76,7 @@ - \ref dnn1 - \ref dnn2 - \ref dnn3 + - \ref chain - \subpage online_decoding - \subpage kws - \subpage queue diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox index e03d350e308..97c81dd0bcc 100644 --- a/src/doc/online_decoding.dox +++ b/src/doc/online_decoding.dox @@ -40,11 +40,11 @@ namespace kaldi { deprecated, and may eventually be removed from the trunk (but remain in ^/branches/complete). - There is some documentation for the older setup \ref online_programs "here", + There is some documentation for the older setup \ref online_programs "here", but we recommend to read this page first. \section online_decoding_scope Scope of online decoding in Kaldi - + In Kaldi we aim to provide facilities for online decoding as a library. That is, we aim to provide the functionality for online decoding but not necessarily command-line tools for it. The reason is, different @@ -59,7 +59,7 @@ namespace kaldi { neural net models (see section \ref online_decoding_nnet2). \section GMM-based online decoding - + The program online2-wav-gmm-latgen-faster.cc is currently the primary example program for the GMM-based online-decoding setup. It reads in whole wave files but internally it processes them chunk by chunk with no dependency on the future. In the example script egs/rm/s5/local/online/run_gmm.sh @@ -68,7 +68,7 @@ namespace kaldi { procedure within a typical batch-processing framework, so that you can easily evaluate word error rates. We plan to add similar programs for SGMMs and DNNs. In order to actually do online decoding, you would have to modify this program. - We should note (and this is obvious to speech recognition people but not to outsiders) + We should note (and this is obvious to speech recognition people but not to outsiders) that the audio sample rate needs to exactly match what you used in training (and oversampling won't work but subsampling will). @@ -77,11 +77,11 @@ namespace kaldi { In Kaldi, when we use the term "decoder" we don't generally mean the entire decoding program. We mean the inner decoder object, generally of the type LatticeFasterDecoder. This object takes the decoding graph (as an FST), and the decodable object - (see \ref decodable_interface). All the decoders naturally support online decoding; it + (see \ref decodable_interface). All the decoders naturally support online decoding; it is the code in the decoding program (but outside of the decoder) that needs to change. 
We should note, though, a difference in how you need to invoke the decoder for online decoding. - - In the old online-decoding setup (in online/), if "decoder" is some decoder + - In the old online-decoding setup (in online/), if "decoder" is some decoder (e.g. of type LatticeFasterDecoder) and "decodable" is a decodable object of a suitable type, you would call decoder.Decode(&decodable), and this call would block until the input was finished (because the decoder @@ -90,16 +90,16 @@ namespace kaldi { decoder.InitDecoding(), and then each time you get more feature data, you would call decoder.AdvanceDecoding(). For offline use, you can still call Decode(). - + We should mention here that in the old online setup, there is a decoder called OnlineFasterDecoder. Do not assume from the name of this that it is the only decoder to support online decoding. The special thing about the OnlineFasterDecoder is that it has the ability to work out which words are going to be "inevitably" decoded regardless of what audio data comes in in future, so you can output those words. This is useful in an online-transcription context, and if there seems to - be a demand for this, we may move that decoder from online/ into the decoder/ - directory and make it compatible with the new online setup. - + be a demand for this, we may move that decoder from online/ into the decoder/ + directory and make it compatible with the new online setup. + \section online_decoding_feature Feature extraction in online decoding @@ -113,14 +113,14 @@ namespace kaldi { (OnlineFeatureInterface::GetFrame()) and how it says how many frames are ready (OnlineFeatureInterface::NumFramesReady()), but does not say how it obtains those features. That is up to the child class. - + In online-feature.h we define classes OnlineMfcc and OnlinePlp which - are the lowest-level features. They have a member function + are the lowest-level features. They have a member function OnlineMfccOrPlp::AcceptWaveform(), which the user should call when data is captured. All the other online feature types in online-feature.h are "derived" features, so they take an object of OnlineFeatureInterface in their constructor and get their input features through a stored pointer - to that object. + to that object. The only part of the online feature extraction code in online-feature.h that is non-trivial is the cepstral mean and variance normalization (CMVN) @@ -143,7 +143,7 @@ namespace kaldi { In the Kaldi scripts, cepstral mean and variance normalization (CMVN) is generally done on a per-speaker basis. Obviously in an online-decoding context, this is impossible to do because it is "non-causal" (the current - feature depends on future features). + feature depends on future features). The basic solution we use is to do "moving-window" cepstral mean normalization. We accumulate the mean over a moving window of, by default, 6 @@ -178,7 +178,7 @@ namespace kaldi { using a method called basis-fMLLR (again, see below) where we incrementally estimate the parameters, and it is not completely invariant to offsets. - + \section online_decoding_adaptation Adaptation in online decoding The most standard adaptation method used for speech recognition is @@ -187,13 +187,13 @@ namespace kaldi { code and documentation. fMLLR consists of an affine (linear + offset) transform of the features; the number of parameters is d * (d+1), where d is the final feature dimension (typically 40). 
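To make that figure concrete (simple arithmetic, assuming the typical d = 40): the full transform has 40 * (40 + 1) = 1640 parameters, far more than can be estimated reliably from the first few seconds of audio, which is the motivation for the basis method described next.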
In the online decoding program we use - a basis method to incrementally estimate an increasing number of + a basis method to incrementally estimate an increasing number of transform parameters as we decode more data. The top-level logic for this at the decoder level is mostly implemented in class SingleUtteranceGmmDecoder. - + The fMLLR estimation is done not continuously but periodically, since it involves computing lattice posteriors and this can't very easily be done in a continuous - manner. Configuration variables in class OnlineGmmDecodingAdaptationPolicyConfig + manner. Configuration variables in class OnlineGmmDecodingAdaptationPolicyConfig determine when we re-estimate fMLLR. The default currently is, during the first utterance, to estimate it after 2 seconds, and thereafter at times in a geometrically increasing ratio with constant 1.5 (so at 2 seconds, 3 seconds, 4.5 seconds...). @@ -202,11 +202,11 @@ Note that the CMN adaptation state is frozen, as mentioned above, the first time we estimate fMLLR for a speaker, which by default will be two seconds into the - first utterance. + first utterance. \section online_decoding_models Use of multiple models in GMM-based online decoding - In the online decoding code for GMMs in online-gmm-decoding.h, up to three + In the online decoding code for GMMs in online-gmm-decoding.h, up to three models can be supplied. These are held in class OnlineGmmDecodingModels, which takes care of the logic necessary to decide which model to use for different purposes if fewer models are supplied. The three models are: @@ -215,12 +215,12 @@ - A speaker adapted model, trained with fMLLR - A discriminatively trained version of the speaker adapted model It is our practice to use a Maximum Likelihood estimated model to estimate - adaptation parameters, as this is more consistent with the Maximum Likelihood framework + adaptation parameters, as this is more consistent with the Maximum Likelihood framework than using a discriminatively trained model, although this probably makes little difference and you would lose little (and save some memory) by using the discriminatively - trained model for this purpose. + trained model for this purpose. + - \section online_decoding_nnet2 Neural net based online decoding with iVectors Our best online-decoding setup, which we recommend should be used, is the neural @@ -245,31 +245,31 @@ example setups, e.g. in egs/rm/s5, egs/wsj/s5, egs/swbd/s5b, and egs/fisher_english/s5. The top-level example script is always called local/online/run_nnet2.sh. In the case of the Resource Management recipe there is also a script local/online/run_nnet2_wsj.sh. This demonstrates - how to take a larger neural net trained on out-of-domain speech with the same sampling rate (in + how to take a larger neural net trained on out-of-domain speech with the same sampling rate (in this example, WSJ), and retrain it on in-domain data. In this way we obtained our best-ever results on RM. We are currently working on example scripts for discriminative training for this setup. \subsection online_decoding_nnet2_example Example for using already-built online-nnet2 models - + In this section we will explain how to download already-built online-nnet2 models from www.kaldi-asr.org and evaluate them on your own data.
- The reader can download the models and other relating files from - http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5 , + The reader can download the models and other relating files from + http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5 , which are built using the fisher_english recipe. To use the online-nnet2 models, the reader - only needs to download two directories: exp/tri5a/graph and exp/nnet2_online/nnet_a_gpu_online. Use the + only needs to download two directories: exp/tri5a/graph and exp/nnet2_online/nnet_a_gpu_online. Use the following commands to download the archives and extract them: - + \verbatim wget http://kaldi-asr.org/downloads/build/5/trunk/egs/fisher_english/s5/exp/nnet2_online/nnet_a_gpu_online/archive.tar.gz -O nnet_a_gpu_online.tar.gz wget http://kaldi-asr.org/downloads/build/2/sandbox/online/egs/fisher_english/s5/exp/tri5a/graph/archive.tar.gz -O graph.tar.gz mkdir -p nnet_a_gpu_online graph tar zxvf nnet_a_gpu_online.tar.gz -C nnet_a_gpu_online tar zxvf graph.tar.gz -C graph - \endverbatim - Here the archives are extracted to the local directory. We need to modify pathnames in the + \endverbatim + Here the archives are extracted to the local directory. We need to modify pathnames in the config files, which we can do as follows: \verbatim for x in nnet_a_gpu_online/conf/*conf; do @@ -280,7 +280,7 @@ done Next, choose a single wav file to decode. The reader can download a sample file by typing \verbatim wget http://www.signalogic.com/melp/EngSamples/Orig/ENG_M.wav - \endverbatim + \endverbatim This is a 8kHz-sampled wav file that we found online (unfortunately it is UK English, so the accuracy is not very good). It can be decoded with the following command: \verbatim @@ -296,17 +296,132 @@ done You can see the result in the logging output (although there are other ways to retrieve this). For us, the logging output was as follows: \verbatim -/home/dpovey/kaldi-online/src/online2bin/online2-wav-nnet2-latgen-faster --do-endpointing=false --online=false --config=nnet_a_gpu_online/conf/online_nnet2_decoding.conf --max-active=7000 --beam=15.0 --lattice-beam=6.0 --acoustic-scale=0.1 --word-symbol-table=graph/words.txt nnet_a_gpu_online/smbr_epoch2.mdl graph/HCLG.fst 'ark:echo utterance-id1 utterance-id1|' 'scp:echo utterance-id1 ENG_M.wav|' ark:/dev/null +/home/dpovey/kaldi-online/src/online2bin/online2-wav-nnet2-latgen-faster --do-endpointing=false --online=false --config=nnet_a_gpu_online/conf/online_nnet2_decoding.conf --max-active=7000 --beam=15.0 --lattice-beam=6.0 --acoustic-scale=0.1 --word-symbol-table=graph/words.txt nnet_a_gpu_online/smbr_epoch2.mdl graph/HCLG.fst 'ark:echo utterance-id1 utterance-id1|' 'scp:echo utterance-id1 ENG_M.wav|' ark:/dev/null LOG (online2-wav-nnet2-latgen-faster:ComputeDerivedVars():ivector-extractor.cc:180) Computing derived variables for iVector extractor LOG (online2-wav-nnet2-latgen-faster:ComputeDerivedVars():ivector-extractor.cc:201) Done. 
-utterance-id1 tons of who was on the way for races two miles and then in nineteen ninety to buy sodas sale the rate them all these to commemorate columbus is drawn into the new world five hundred years ago on the one to the moon is to promote the use of so the sales in space exploration +utterance-id1 tons of who was on the way for races two miles and then in nineteen ninety to buy sodas sale the rate them all these to commemorate columbus is drawn into the new world five hundred years ago on the one to the moon is to promote the use of so the sales in space exploration LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:253) Decoded utterance utterance-id1 LOG (online2-wav-nnet2-latgen-faster:Print():online-timing.cc:51) Timing stats: real-time factor for offline decoding was 1.62102 = 26.7482 seconds / 16.5009 seconds. LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:259) Decoded 1 utterances, 0 with errors. LOG (online2-wav-nnet2-latgen-faster:main():online2-wav-nnet2-latgen-faster.cc:261) Overall likelihood per frame was 0.230575 per frame over 1648 frames. \endverbatim +Note that for mismatched data, sometimes the iVector estimation can get confused and lead to bad results. +Something that we have found useful is to weight down the silence in the iVector estimation. +To do this you can set e.g. --ivector-silence-weighting.silence-weight=0.001; you need to set the silence +phones as appropriate, e.g. --ivector-silence-weighting.silence-phones=1:2:3:4 +(this should be a list of silence or noise phones in your phones.txt; you can experiment with +which ones to include). + +\subsection online_decoding_nnet2_lm Example for using your own language model with existing online-nnet2 models +Oftentimes users will have to use their own language model to improve the +recognition accuracy. In this section we will explain how to build a language +model with SRILM, and how to incorporate this language model into the existing +online-nnet2 models. + +We first have to build an ARPA format language model with SRILM. Note that SRILM +comes with a lot of training options, and we assume it's the user's +responsibility to figure out the best setting for their own application. +Suppose "train.txt" is our language model training corpus (e.g., training +data transcriptions), and "wordlist" is our vocabulary. Here we assume the +language model vocabulary is the same as the recognizer's vocabulary, i.e., it +only contains the words from data/lang/words.txt, except the epsilon symbol +"<eps>" and disambiguation symbol "#0". We will explain how we can use a +different vocabulary in the next section. We can build a 3-gram Kneser-Ney +language model using the following SRILM command +\verbatim +ngram-count -text train.txt -order 3 -limit-vocab -vocab wordlist -unk \ + -map-unk "<unk>" -kndiscount -interpolate -lm srilm.o3g.kn.gz +\endverbatim +Now that we have the ARPA format language model trained, we have to compile it +into WFST format. Let's first define the following variables +\verbatim +lm=srilm.o3g.kn.gz # ARPA format LM you just built. +lang=data/lang # Old lang directory provided by the online-nnet2 models +lang_own=data/lang_own # New lang directory we are going to create, which contains the new language model +lang_own_tmp=data/local/lang_own_tmp/ # Temporary directory.
+\endverbatim + +Given the above variables, we can compile an ARPA format language model into +WFST format using the following commands +\verbatim +mkdir -p $lang_own_tmp +mkdir -p $lang_own +cp -r $lang/* $lang_own +gunzip -c $lm | utils/find_arpa_oovs.pl $lang_own/words.txt \ + > $lang_own_tmp/oovs.txt || exit 1 +gunzip -c $lm | \ + grep -v '<s> <s>' | \ + grep -v '</s> <s>' | \ + grep -v '</s> </s>' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl $lang_own_tmp/oovs.txt | \ + utils/eps2disambig.pl | utils/s2eps.pl | \ + fstcompile --isymbols=$lang_own/words.txt --osymbols=$lang_own/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon | fstarcsort --sort_type=ilabel > $lang_own/G.fst +utils/validate_lang.pl --skip-determinization-check $lang_own || exit 1; +\endverbatim + +Now, we can compile the decoding graph with the new language model, using the +following command +\verbatim +graph_own_dir=$model_dir/graph_own +utils/mkgraph.sh $lang_own $model_dir $graph_own_dir || exit 1; +\endverbatim +where $model_dir is the model directory which contains the model "final.mdl" +and the tree "tree". At this point, we can use $graph_own_dir/HCLG.fst to +replace the old HCLG.fst, which uses the language model we just built. + +\subsection online_decoding_nnet2_vocab Example for using a different vocabulary with existing online-nnet2 models +For most applications users will also have to change the recognizer's existing +vocabulary, for example, adding out-of-vocabulary words such as person names +to the existing vocabulary. In this section we will explain how this can be +done. + +We first have to create a new pronunciation lexicon, typically by adding more +words to the recognizer's existing pronunciation lexicon. The recognizer's +lexicon that we are going to modify is usually located at $dict_dir/lexicon.txt, +where $dict_dir is the recognizer's dictionary directory, and is usually +data/local/dict. The new lexicon can be created manually by adding new lexical +entries to $dict_dir/lexicon.txt. If we do not have pronunciations for the new +words, we can use grapheme-to-phoneme (G2P) conversion to generate pronunciations +automatically. The commonly used G2P tools are Sequitur and Phonetisaurus; the +latter is usually much faster. + +The second step is to create a dictionary directory for our new lexicon, which +contains the required files, for example, lexicon.txt, lexiconp.txt, etc. +Most likely if we don't change the lexicon's phone set, the old files such as +extra_questions.txt, nonsilence_phones.txt, optional_silence.txt, +silence_phones.txt can be re-used. For details of how to create those files, we +suggest the users follow the existing Kaldi scripts, for example this one: +egs/wsj/s5/local/wsj_prepare_dict.sh. The format of the dictionary directory is +described \ref data_prep_lang_creating "here". + +Now we can create a new lang directory with the updated lexicon.
Suppose +$lang is the recognizer's old lang directory, $lang_own is the new lang +directory that we are going to create, $dict_own is the dictionary directory we +just created, and "<unk>" is the word symbol that represents +out-of-vocabulary words in the lexicon, we can generate the new lang directory +with the updated lexicon using the following command +\verbatim +utils/prepare_lang.sh \ + --phone-symbol-table $lang/phones.txt \ + $dict_own "<unk>" $lang_own_tmp $lang_own +\endverbatim +Make sure you use the option "--phone-symbol-table", which makes sure that +phones in your new lexicon will be compatible with the recognizer. + +The last step is of course to update the decoding graph, using the following +command +\verbatim +graph_own_dir=$model_dir/graph_own +utils/mkgraph.sh $lang_own $model_dir $graph_own_dir || exit 1; +\endverbatim +where $model_dir is the model directory which contains the model "final.mdl" +and the tree "tree". We can now use $graph_own_dir/HCLG.fst to replace the old +HCLG.fst. */ diff --git a/src/doc/queue.dox b/src/doc/queue.dox index cdf0cf63c40..72b2c44eab8 100644 --- a/src/doc/queue.dox +++ b/src/doc/queue.dox @@ -34,7 +34,7 @@ namespace kaldi { If you look at a top-level example script like egs/wsj/s5/run.sh, you'll see commands like \verbatim steps/train_sat.sh --cmd "$train_cmd" \ - 4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a + 4200 40000 data/train_si284 data/lang exp/tri3b_ali_si284 exp/tri4a \endverbatim At the top of the run.sh script you'll see it sourcing a file called cmd.sh: \verbatim @@ -44,8 +44,8 @@ and in cmd.sh you'll see the following variable being set: \verbatim export train_cmd="queue.pl -l arch=*64" \endverbatim -You'll change this variable if you don't have GridEngine or if your queue is configured -differently from CLSP\@JHU. To run everything locally on a single machine you can +You'll change this variable if you don't have GridEngine or if your queue is configured +differently from CLSP\@JHU. To run everything locally on a single machine you can set export train_cmd=run.pl. In steps/train_sat.sh the variable cmd is set to the argument @@ -87,10 +87,10 @@ In this case, the command that actually gets executed will be something like: \verbatim echo "hello world number JOB" | head -n 1 > output.JOB \endverbatim -If you want to see what's actually getting executed, you can look in a file like +If you want to see what's actually getting executed, you can look in a file like foo.1.log, where you'll see the following: \verbatim -# echo "hello world number 1" | head -n 1 > output.1 +# echo "hello world number 1" | head -n 1 > output.1 # Started at Sat Jan 3 17:44:20 PST 2015 # # Accounting: time=0 threads=1 @@ -114,15 +114,15 @@ and what we are about to say also holds for run.pl, ssh.pl and <options> may include some or all of the following:
  - A job range specifier (e.g. JOB=1:10). The name is uppercase by convention only, and may include underscores.
    The starting index must be 1 or more; this is a GridEngine limitation.
  - Anything that looks as if it would be accepted by GridEngine as an option to qsub.
    For example, -l arch=*64*, or -l mem_free=6G,ram_free=6G, or -pe smp 6. For compatibility, scripts other than queue.pl will ignore such options.
  - New-style options like --mem 10G (see below).
-<log-file> is just a filename, which for array jobs must contain the identifier of +<log-file> is just a filename, which for array jobs must contain the identifier of the array (e.g. exp/foo/log/process_data.JOB.log). <command> can basically be anything, including symbols that would @@ -144,7 +144,7 @@ string itself contains single quotes then it uses double quotes instead. This usually does what we want. The PATH variable from the shell that you executed queue.pl from will be passed through to the scripts that get executed, and just to be certain you get everything you need, -the file ./path.sh will also be sourced. The commands will be executed +the file ./path.sh will also be sourced. The commands will be executed with bash. \subsection parallelization_common_new New-style options (unified interface) @@ -177,7 +177,7 @@ file specifies how to convert the "new-style" options into options that GridEngine or similar software can interpret. The following example shows the behavior that the default config file specifies: - +
 New-style option     Converted form (for GridEngine)   Comment
 --mem 10G            -l mem_free=10G,ram_free=10G
 --max-jobs-run 10    -tc 10                            (We use this for jobs that cause too much I/O.)
 --num-threads 6      -pe smp 6
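To connect the table above to an actual command line, here is a schematic invocation; the script name, job count and log path are made up for illustration, and --mem 10G would be converted to -l mem_free=10G,ram_free=10G as shown above:
\verbatim
# a 20-way array job; JOB is substituted into both the log file name and the command
queue.pl --mem 10G JOB=1:20 exp/foo/log/process_data.JOB.log my_script.sh --part JOB
\endverbatim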
It's also possible to add extra options with this general format, i.e. options that look like ---foo-bar and take one argument. The default configuration tabulated above works for the CLSP grid +--foo-bar and take one argument. The default configuration tabulated above works for the CLSP grid but may not work everywhere, because GridEngine is very configurable. Thefore you may have to create a config file conf/queue.conf and edit it to work with your grid. The following configuration file is the one that queue.pl defaults to if conf/queue.conf @@ -209,10 +209,10 @@ The line beginning with command specifies the unchanging part of the command line, and you can modify this to get it to use grid software other than GridEngine, or to specify options that you always want. The lines beginning with -option specify how to transform the input options such as --mem. +option specify how to transform the input options such as --mem. Lines beginning with something like "option mem=*" handle the general case (the $0 gets replaced with the actual argument to the option), while -lines like "option gpu=0" allow you to specify special behavior for special +lines like "option gpu=0" allow you to specify special behavior for special cases of the argument, so in this case the option --gpu 0 is configured to produce no extra options to qsub at all. The line "default gpu=0" specifies that if you don't give the --gpu option at all, queue.pl should act like @@ -224,10 +224,10 @@ configured with a line: "option gpu=0 -q all.q", so there was a time wh The mapping from what the config-file specifies to what appears on the command-line of qsub sometimes has to be tweaked slightly in the perl code: for instance, we made it -so that the --max-jobs-run option is ignored for non-array jobs. +so that the --max-jobs-run option is ignored for non-array jobs. + + \subsection parallelization_common_new_example Example of configuring grid software with new-style options - \subsection parallelization_common_new_example Example of configuring grid - software with new-style options We'd like to give an example of how the config file can be used in a real situation. We had a problem where, due to a bug in an outdated version of the @@ -270,10 +270,10 @@ parallelization scripts. line. The scripts that we ask qsub to run also make use of the variable $SGE_TASK_ID, which SGE sets to the job index for array jobs. Our plan is to extend the config-file mechanism as necessary to accommodate whatever changes are needed to support - other grid software, within reason. + other grid software, within reason. Since we have explained the behavior of queue.pl at length above, we aren't going - to provide many further details in this section, but please see below the section + to provide many further details in this section, but please see below the section \ref parallelization_gridengine. \subsection parallelization_specific_run Parallelization using run.pl @@ -300,12 +300,12 @@ parallelization scripts. ssh.pl is a poor man's queue.pl, for use in case you have a small cluster of several machines but don't want the trouble of setting - up GridEngine. Like run.pl, it doesn't attempt to keep track of + up GridEngine. Like run.pl, it doesn't attempt to keep track of CPUs or memory; it works like run.pl except that it distributes the - jobs across multiple machines. + jobs across multiple machines. 
You have to create a file .queue/machines (where .queue is a subdirectory of the directory you are running the script from), - where each line contains the name of a machine. It needs to be possible to ssh to each + where each line contains the name of a machine. It needs to be possible to ssh to each of these machines without a password, i.e. you have to set up your ssh keys. @@ -355,7 +355,7 @@ parallelization scripts. To install GridEngine on the master, you'll run (on your chosen master node): \verbatim sudo apt-get install gridengine-master gridengine-client -\endverbatim +\endverbatim Select "yes" for automatic configuration. It will ask you for the "cell name", which you can leave as "default", and it will ask for the name of the "master", which you should set to the hostname of @@ -366,7 +366,7 @@ parallelization scripts. sometimes be traced to this. Also be aware that doing "apt-get remove" of these packages and reinstalling them won't give you a blank slate because Debian sometimes remembers your selections; this can be a pain. - + It will make your life easier if you add yourself as manager, so do: \verbatim sudo qconf -am @@ -377,9 +377,9 @@ parallelization scripts. To install GridEngine on the normal nodes, you'll run \verbatim sudo apt-get install gridengine-client gridengine-exec -\endverbatim +\endverbatim The "cell name" should be left as "default", and the "master" should be the name of - the master node that you previously installed. + the master node that you previously installed. You can run this on the master too if the master is to run jobs also. Typing qstat and qhost -q will let you know whether things are working. @@ -399,13 +399,13 @@ instance-1.c.analytical-rig-638.internal lx26-amd64 1 0.07 3.6G 133.9M doesn't like it when these things are inconsistent. If you need to change the name of the master from what you told the installer, you may be able to do so by editing the file \verbatim -/var/lib/gridengine/default/common/act_qmaster +/var/lib/gridengine/default/common/act_qmaster \endverbatim (at least, this is where it's located in Debian Wheezy). \subsection parallelization_gridengine_configuring Configuring GridEngine - First let's make sure that a queue is defined. GridEngine doesn't define any queues by + First let's make sure that a queue is defined. GridEngine doesn't define any queues by default. We'll set up a queue called all.q. Make sure the shell variable EDITOR is set to your favorite shell (e.g. vim or emacs), and type as follows; and this should work from master or client. @@ -438,9 +438,9 @@ change root to an email address where you want to receive notifications if things go wrong. Be advised that due to anti-spam measures, sending emails from the cloud is painful from EC2 and close to impossible from Google's cloud offering, so it may be best just to leave this field the -way it is and make do without email notifications. You could also edit the file so that it says +way it is and make do without email notifications. You could also edit the file so that it says \verbatim - flush_time=00:00:10 + flush_time=00:00:10 \endverbatim (the default is 00:00:15), which will give a slightly faster turnaround time for submitting jobs. @@ -449,23 +449,23 @@ your jobs, and these can be viewed using qconf -sc. Modify them using qconf -mc. 
Modify the mem_free line to change the default memory requirement from 0 to 1G, i.e.: \verbatim -#name shortcut type relop requestable consumable default urgency +#name shortcut type relop requestable consumable default urgency #------------------------------------------------------------------------------------------ - + mem_free mf MEMORY <= YES NO 1G 0 \endverbatim and also add the following two new lines; it doesn't matter where in the file you add them. \verbatim -#name shortcut type relop requestable consumable default urgency +#name shortcut type relop requestable consumable default urgency #------------------------------------------------------------------------------------------ - + gpu g INT <= YES YES 0 10000 ram_free ram_free MEMORY <= YES JOB 1G 0 \endverbatim You'll only need the "gpu" field if you add GPUs to your grid; the ram_free is a field that we find useful in managing the memory of the machines, as the inbuilt field mem_free doesn't seem to work quite right for our purposes. Later on - when we add hosts to the grid, we'll use the command qconf -me to + when we add hosts to the grid, we'll use the command qconf -me to edit the complex_values field to read something like: \verbatim complex_values ram_free=112G,gpu=2 @@ -474,7 +474,7 @@ ram_free ram_free MEMORY <= YES JOB 1G a job that needs 10G of memory, we'll specify -l mem_free=10G,ram_free=10G as an option to qsub; the mem_free requirement makes sure the machine has that much free memory at the time the job starts, and the ram_free requirement makes sure we - don't submit a lot of jobs requiring a lot of memory, all to the same host. + don't submit a lot of jobs requiring a lot of memory, all to the same host. We tried, as an alternative to adding the ram_free resource, using qconf -mc to edit the consumable field of the inbuilt mem_free resource to say YES, to make GridEngine keep track of memory requests; but this did not @@ -512,7 +512,7 @@ pe_list make smp \verbatim prolog /var/lib/gridengine/default/common/prolog.sh \endverbatim - (the default was NONE), + (the default was NONE), and the script /var/lib/gridengine/default/common/prolog.sh, which we copied to that location on each individual node in the cluster, reads as follows. Its only purpose is to wait a short time if the job script can't be @@ -531,7 +531,7 @@ function test_ok { if [ ! -z "$SGE_STDERR_PATH" ]; then if [ ! -d "`dirname $SGE_STDERR_PATH`" ]; then echo "$0: warning: no such directory $JOB_SCRIPT, will wait." 1>&2 - return 1; + return 1; fi fi return 0; @@ -558,7 +558,7 @@ We also edited the queue with qconf -mq all.q to change rerun TRUE \endverbatim This means that when jobs fail, they get in a status that shows up in the output of -qstat as Eqw, with the E indicating error, and you can ask the +qstat as Eqw, with the E indicating error, and you can ask the queue to reschedule them by clearing the error status with qmod -cj (or if you don't want to rerun them, you can delete them with qmod -dj ). Setting the queue to allow reruns can avoid the hassle of rerunning scripts from the @@ -582,8 +582,8 @@ rlogin_daemon builtin rsh_command builtin rsh_daemon builtin \endverbatim -This was to solve a problem whose nature we can no longer recall, but it's something you might want to try it if -commands like qlogin and qrsh don't work. +This was to solve a problem whose nature we can no longer recall, but it's something you might want to try it if +commands like qlogin and qrsh don't work. 
\subsection parallelization_gridengine_configuring_adding Configuring GridEngine (adding nodes) @@ -591,7 +591,7 @@ commands like qlogin and qrsh don't work. As mentioned above, you can install GridEngine on nodes by doing \verbatim sudo apt-get install gridengine-client gridengine-exec -\endverbatim +\endverbatim and you need to specify default as the cluster name, and the name of your master node as the master (probably using the FQDN of the master is safest here, but if you are on a local network, just the last part of the name may also work). @@ -613,7 +613,7 @@ commands like qlogin and qrsh don't work. \verbatim complex_values ram_free=112G,gpu=1 \endverbatim -You'll notice is a slight asymmetry between the commands qconf -sh +You'll notice is a slight asymmetry between the commands qconf -sh and qconf -ss on the one hand, and qconf -sel on the other. The "l" in the latter command means show the list. The difference is that administrative and submit host lists are just lists of hosts, whereas @@ -623,7 +623,7 @@ You can view the information about a particular host with qconf -se qconf -ae , and modify with qconf -me . This is a general pattern in GridEngine: for things like queues that have a bunch of information in them, you can show the full list -by typing a command ending in "l" like qconf -sql, and the corresponding "add" +by typing a command ending in "l" like qconf -sql, and the corresponding "add" ("a") and "modify" ("m") commands accept arguments. It's not enough to tell GridEngine that a node is an execution host; you have to also add it to the queue, @@ -647,7 +647,7 @@ nodes with that number of slots you can save yourself some time and avoid adding name to the slots field. There is an alternative way to set up the hostlist field. GridEngine has the concept of host groups, so you could do qconf -ahgrp \@allhosts to add a group of hosts, and edit it using -qconf -mhgrp \@allhosts to add your new nodes. The configuration of +qconf -mhgrp \@allhosts to add your new nodes. The configuration of all.q could then just read: \verbatim hostlist @allhosts @@ -663,12 +663,12 @@ HOSTNAME ARCH NCPU LOAD MEMTOT MEMUSE SWAPTO SWAPUS ------------------------------------------------------------------------------- global - - - - - - - a01.clsp.jhu.edu lx26-amd64 24 12.46 126.2G 11.3G 86.6G 213.7M - all.q BIP 0/6/20 + all.q BIP 0/6/20 a02.clsp.jhu.edu lx26-amd64 24 16.84 126.2G 12.4G 51.3G 164.5M - all.q BIP 0/18/20 + all.q BIP 0/18/20 \endverbatim -If you see the letter "E" in the place where the example above shows "BIP", +If you see the letter "E" in the place where the example above shows "BIP", it means the node is in the error state. Other letters you don't want to see in that position are "a" for alarm (a generic indicator of badness) and "u" for unreachable. "d" means a node has been disabled by an administrator. @@ -694,8 +694,8 @@ You can view all jobs from all users by running \verbatim qstat -u '*' \endverbatim - -\section parallelization_grid_stable Keeping your grid stable + +\section parallelization_grid_stable Keeping your grid stable In this section we have some general notes on how to ensure stability in a compute cluster of the kind useful for Kaldi. @@ -754,7 +754,7 @@ We show it as if we're grepping it from /etc/fstab; this isn't actually how we d # grep a05 /etc/fstab a05:/mnt/data /export/a05 nfs rw,vers=3,rsize=8192,wsize=8192,acdirmin=5,acdirmax=8,hard,proto=tcp 0 0 \endverbatim -The option "vers=3" means we use NFS version 3, which is stateless. 
We tried using version 4, +The option "vers=3" means we use NFS version 3, which is stateless. We tried using version 4, a supposedly more advanced "stateful" protocol, but we got a lot of crashes. The acdirmin=5 and acdirmin=8 options are the minimum and maximum times that NFS @@ -762,7 +762,7 @@ waits before re-reading cached directory information; the defaults are 30 and 60 This is important for Kaldi scripts, because the files that we execute on GridEngine are written only shortly before we run the scripts, so with default NFS options they may not yet be visible on the execution host at the time they are needed. Above we showed our script /var/lib/gridengine/default/common/prolog.sh -which waits up to 14 seconds for the script to appear. It's significant that 14 > 8, i.e. that the +which waits up to 14 seconds for the script to appear. It's significant that 14 > 8, i.e. that the number of seconds the prolog script will wait for is greater than the maximum directory caching period for NFS. The hard option is also important; it means that if the server is busy, the client will wait @@ -813,7 +813,7 @@ and manage a compute grid. In CLSP we use a lot of NFS hosts, not just one or two; in fact, most of our nodes also export data via NFS. If you do this you should use our mem-killer.pl or a similar script, or you will get instability due -to memory exhaustion when users make mistakes. +to memory exhaustion when users make mistakes. Having a large number of file servers is a particularly good idea for queues that are shared by many people, because it's inevitable that people will overload file servers, and if there are diff --git a/src/doc/tree_externals.dox b/src/doc/tree_externals.dox index ee2bc11d8b9..df9f96e8430 100644 --- a/src/doc/tree_externals.dox +++ b/src/doc/tree_externals.dox @@ -32,13 +32,13 @@ namespace kaldi { The basic algorithm that is being implemented is a top-down greedy splitting, where we have a number of ways we can split the data by asking about, say, the left phone, the right - phone, the central phone, the state we're in, and so on. + phone, the central phone, the state we're in, and so on. The algorithm we implement is similar to the standard algorithm, see for example the paper "Tree-based State Tying for High Accuracy Acoustic Modeling" by Young, Odell and Woodland. In this algorithm, we split the data up by asking the locally optimal question, i.e. the one that gives the most likelihood increase, supposing - we model the data on each side of the split by a single Gaussian. - Differences from standard implementations include added flexibility + we model the data on each side of the split by a single Gaussian. + Differences from standard implementations include added flexibility about how to configure the tree roots; the ability to ask questions about the HMM-state and the central phone; and the fact that by default in the Kaldi scripts, the questions are automatically generated by a top-down binary clustering of the data, which means @@ -50,7 +50,7 @@ namespace kaldi { be the tree roots. For how to configure it using the standard scripts, see \ref data_prep. In practice we generally let each tree-root correspond to a "real phone", meaning that we group together all word-position-dependent, tone-dependent or stress-dependent versions of - each phone into one group that becomes a tree root. + each phone into one group that becomes a tree root. The rest of this page mostly gives details at the code level of what is happening. 
@@ -74,7 +74,7 @@ below summarizes these values: N is the width of the context window and P is the identity of the designated -"central phone". Normally P is exactly in the middle of the window +"central phone". Normally P is exactly in the middle of the window (hence the name "central-position"); for example, with N=3, we would normally have P=1, but you are free to choose any value from 0 to N-1; for instance, P=2 and N=3 means two phones of left context and no right context at all. @@ -82,32 +82,32 @@ In the code, when we talk about the "central phone" we always mean the P'th phone which may or may not actually be the central phone of the context window. A vector of integers representing a typical triphone context window might be: -\code -// probably not valid C++ +\code +// probably not valid C++ vector<int32> ctx_window = { 12, 15, 21 }; \endcode -Assuming N=3 and P=1, this would represent phone 15 with +Assuming N=3 and P=1, this would represent phone 15 with a right context of 21 and a left context of 12. The way we handle end effects is using zero (which is not a valid phone because it's reserved in OpenFst for the epsilon meaning "no symbol"), so for instance: -\code +\code vector<int32> ctx_window = { 12, 15, 0 }; \endcode means phone 15 with a left-context of 12 and no right-context because it's the end of the utterance. At the end of utterance in particular, the use of zero this way may be a little unexpected because the last "phone" is actually the -subsequential symbol "$" (see \ref graph_c), but for the convenience +subsequential symbol "$" (see \ref graph_c), but for the convenience of the decision-tree code we don't put the subsequential symbol in these context windows, we put zero. Note that if we had N=3 and P=2, the above context window would be invalid because its P'th element would be zero which is not a real phone; also of course, -if we had a tree with N=1, neither of the windows above would be valid because they +if we had a tree with N=1, neither of the windows above would be valid because they are the wrong size. In the monophone case, we would have a window like: -\code +\code vector<int32> ctx_window = { 15 }; \endcode so monophone systems are just treated as a special case of context-dependent -systems, with a window size N of 1 and a tree that doesn't do anything very +systems, with a window size N of 1 and a tree that doesn't do anything very interesting. @@ -126,28 +126,28 @@ TransitionModel object and an AmDiagGmm object). If the program gmm-init-mono receives an option called --shared-phones, it will share the pdfs between specified sets of phones; otherwise it makes all the phones separate. -After training a monophone system starting from a flat start, we take +After training a monophone system starting from a flat start, we take the monophone alignments -and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc +and use the function AccumulateTreeStats() (called from \ref acc-tree-stats.cc "acc-tree-stats") to accumulate statistics for training the tree. This program is not limited to reading in monophone alignments; it works from context-dependent alignments too so we can build trees based on e.g. triphone alignments. -The statistics for tree building are written to disk as the type \ref BuildTreeStatsType -(see \ref treei_stats). +The statistics for tree building are written to disk as the type \ref BuildTreeStatsType +(see \ref treei_stats).
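As a concrete illustration of this step, a schematic invocation might look as follows; the paths and the silence-phone id are made up, and the real training scripts (e.g. steps/train_deltas.sh) pipe in transformed features rather than reading feats.scp directly:
\verbatim
acc-tree-stats --ci-phones=1 exp/mono_ali/final.mdl \
  scp:data/train/feats.scp "ark:gunzip -c exp/mono_ali/ali.1.gz|" exp/tri1/1.treeacc
\endverbatim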
The function AccumulateTreeStats() takes the values N and P, as explained in the previous section; the command-line programs will set these by default to 3 and 1 respectively, but this can be overridden using the --context-width -and --central-position options. The program \ref acc-tree-stats.cc +and --central-position options. The program \ref acc-tree-stats.cc "acc-tree-stats" takes a list of context-independent phones (e.g. silence), but this is not required even if there are context-independent phones; it is just -a mechanism to reduce the size of the statistics. +a mechanism to reduce the size of the statistics. For context-independent phones, the program will accumulate the corresponding statistics without the keys corresponding to the left and right phones defined (c.f. \ref treei_event_map). When the statistics have been -accumulated we use the program \ref build-tree.cc "build-tree" to -build the tree. This outputs the tree. +accumulated we use the program \ref build-tree.cc "build-tree" to +build the tree. This outputs the tree. The program \ref build-tree.cc "build-tree" requires three things: - The statistics (of type BuildTreeStatsType) - The questions config (of type Questions) @@ -160,21 +160,32 @@ scripts, these are automatically obtained from tree-building statistics by the program cluster-phones. The roots file specifies sets of phones that are going to have shared roots in the decision-tree clustering process, and says for each phone set the following two things: - - "shared" or "not-shared" says whether or not there should be separate - roots for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, - in the typical case), or if the roots - should be shared. If we are going to be splitting (the "split" option - below) we enforce that the roots should be shared. + + - "shared" or "not-shared" says whether or not there should be separate roots + for each of the \ref pdf_class "pdf-classes" (i.e. HMM-states, in the + typical case), or if the roots should be shared. If it says "shared" there + will be a single tree-root for all HMM states (e.g. all three states, in a + normal topology); if "not-shared" there would be (e.g.) three tree-roots, + one for each pdf-class. + - "split" or "not-split" says whether or not the decision tree splitting should actually be done for the roots in question (for silence, we - typically don't split). + typically don't split). If the line says "split" (the normal case) then + we do the decision tree splitting. If it says "not-split" then no splitting + is done and the roots are left un-split. -Be careful because the notation is a bit tricky. The "shared" on the line of -the roots file is about whether we will share all the 3 HMM-states of the phone -in a single tree root. But we will always share together the roots of all the phones that -appear on a single lines of the roots file. This is not configurable via these -strings because if you don't want to share them, you can just put them on -separate lines of the roots file. + +The following will clarify some aspects of how this works: + + - If we say "shared split", then + even though there is one root node for all three HMM-states, the different + HMM states can still get different leaves because the tree can ask questions + about the pdf-class as well as about phonetic context. + + - We always share together the roots of all the phones that appear on a single + line of the roots file.
This is not configurable via these strings because + if you don't want to share the phones' roots, you can just put them on + separate lines of the roots file. Below is an example of a roots file; this assumes that phone 1 is silence and all the other phones have separate roots. @@ -185,14 +196,14 @@ shared split 3 ... shared split 28 \endverbatim -Having multiple phones on the same line is most useful when we have things like position and +Having multiple phones on the same line is most useful when we have things like position and stress-dependent phones; in this case each "real" phone would correspond to a set of integer phone ids. In that case we share the roots for all versions of a particular underlying phone. Below is an example of a roots file -for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; +for Wall Street Journal, from the egs/wsj/s5 scripts (this is in text, not integer form; it would have to be converted to integer form before being read by Kalid): \verbatim -not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S +not-shared not-split SIL SIL_B SIL_E SIL_I SIL_S SPN SPN_B SPN_E SPN_I SPN_S NSN NSN_B NSN_E NSN_I NSN_S shared split AA_B AA_E AA_I AA_S AA0_B AA0_E AA0_I AA0_S AA1_B AA1_E AA1_I AA1_S AA2_B AA2_E AA2_I AA2_S shared split AE_B AE_E AE_I AE_S AE0_B AE0_E AE0_I AE0_S AE1_B AE1_E AE1_I AE1_S AE2_B AE2_E AE2_I AE2_S shared split AH_B AH_E AH_I AH_S AH0_B AH0_E AH0_I AH0_S AH1_B AH1_E AH1_I AH1_S AH2_B AH2_E AH2_I AH2_S @@ -207,7 +218,7 @@ When creating the roots file, you should ensure that at least one phone on each For instance, in this case, if the phone AY was seen in at least some combination of stress and word-position, we would be OK. -In this example, we have various word-position-dependent variants of silence and so on. +In this example, we have various word-position-dependent variants of silence and so on. In this example they will all share their pdf's because they are on the same line and are "not-split"-- but they may have different transition parameters. In fact, most of these variants of silence would never be used as silence never appears inside words; this is for @@ -224,13 +235,13 @@ tree to another using the program \ref convert-ali.cc "convert-ali". pdf-id, and these are contiguous (typically there are several thousand of these in an LVCSR system). They are originally assigned when the tree is first built. Depending how the tree is built, it may or may not be possible to say, for each pdf-id, which phone - it corresponds to. + it corresponds to. \section tree_ctxdep Context dependency objects The ContextDependencyInterface object is a virtual base-class for the tree that specifies how it interacts with the graph-building code. This - interface contains only four functions: + interface contains only four functions: - \ref ContextDependencyInterface::ContextWidth() "ContextWidth()" returns the value of N (context-width) that the tree requires. 
- \ref ContextDependencyInterface::CentralPosition() "CentralPosition()" returns @@ -264,8 +275,8 @@ else \endcode The only class that currently inherits from ContextDependencyInterface -is the class ContextDependency, which has marginally richer interface; -the only important addition is the function \ref ContextDependency::GetPdfInfo +is the class ContextDependency, which has marginally richer interface; +the only important addition is the function \ref ContextDependency::GetPdfInfo "GetPdfInfo" which is used by the TransitionModel class to work out which phones a particular pdf can possibly correspond to (this function could be emulated given only the interface of ContextDependencyInterface, by @@ -274,7 +285,7 @@ enumerating all contexts). The ContextDependency object is actually a fairly thin wrapper for the EventMap object; see \ref tree_internals. We wanted to hide the actual implementation of the tree as much as possible to make it -easy to refactor the code later if needed. +easy to refactor the code later if needed. \section tree_example An example of a decision tree @@ -309,18 +320,18 @@ Below is a kind of quasi-BNF notation that explains the tree-file format. In the example below, the top-level EventMap of the tree is a SplitEventMap (SE) that splits on key 1, which is the central phone. In square brackets are a contiguous range of phone-ids. As it happens, these don't represent a question, but are just a way of -splitting on phones so we can get to the "real" decision trees which are per phone. +splitting on phones so we can get to the "real" decision trees which are per phone. The issue is that this tree was built with "shared roots", so there are various phone-ids, corresponding to different word-position-and-stress-marked versions of the same phone, that share the root. We can't use a TableEventMap (TE) at the top level of the tree, or we'd have to repeat each decision tree several times (since the EventMap is a pure -tree, not a general graph, it has no mechanism for pointers to be "shared"). -The next few instances of the "SE" label are also part of this "quasi-tree" which +tree, not a general graph, it has no mechanism for pointers to be "shared"). +The next few instances of the "SE" label are also part of this "quasi-tree" which is initially splitting on the central phone (as we go down this file we are going deeper into the tree; notice that the braces "{" are opening but not yet closing). Then we have the string "TE -1 5 ( CE 0 CE 1 CE 2 CE 3 CE 4 )", which represents splitting with a TableEventMap -on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. +on the pdf-class "-1" (effectively, the HMM-position), and returning values 0 through 4. The values represent the five pdf-ids for the silence and noise phones SIL, NSN and SPN; in our setup, the pdfs are shared between these three non-speech phones (only the transition matrix is specific to each non-speech phone). @@ -332,8 +343,8 @@ various versions of the phone AA; and question is asking whether the pdf-class ( has value 0 (i.e. the leftmost HMM-state). Assuming the answer is "yes", the next question is "SE 2 [ 220 221 222 223 ]", which is asking whether the phone to the right is one of various forms of the phone "M" (a rather unintuitive question to ask, since we're -in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 
286 287 ]" which is -a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if +in the leftmost HMM-state); if yes, we ask "SE 0 [ 104 105 106 107... 286 287 ]" which is +a question about the phone to the right; if yes, then the pdf-id is 5 ("CE 5") and if no, 696 ("CE 696"). \verbatim s3# copy-tree --binary=false exp/tri1/tree - 2>/dev/null | head -100 @@ -366,8 +377,8 @@ SE 2 [ 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 36 37 38 39 40 41 42 43 44 45 4 \endverbatim Below is a simpler example: the monophone tree from the Resource Management -recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). -The key "0" is the phone-position of zero which represents the central (and only) phone +recipe. The top-level EventMap is a TableEventMap ("TE 0 49 ..."). +The key "0" is the phone-position of zero which represents the central (and only) phone since the context width (N) is 1. The number of entries in the table is 49 (in this case, the number of phones plus one). The first EventMap in the table (index zero) is NULL, because there is no phone with @@ -375,11 +386,11 @@ index zero. The next one is a TableEventMap with three elements, corresponding to the three HMM-states (technically, pdf-classes) of the first phone: "TE -1 3 ( CE 0 CE 1 CE 2 )". \verbatim s3# copy-tree --binary=false exp/mono/tree - 2>/dev/null| head -5 -ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) -TE -1 3 ( CE 3 CE 4 CE 5 ) -TE -1 3 ( CE 6 CE 7 CE 8 ) -TE -1 3 ( CE 9 CE 10 CE 11 ) -TE -1 3 ( CE 12 CE 13 CE 14 ) +ContextDependency 1 0 ToPdf TE 0 49 ( NULL TE -1 3 ( CE 0 CE 1 CE 2 ) +TE -1 3 ( CE 3 CE 4 CE 5 ) +TE -1 3 ( CE 6 CE 7 CE 8 ) +TE -1 3 ( CE 9 CE 10 CE 11 ) +TE -1 3 ( CE 12 CE 13 CE 14 ) \endverbatim @@ -391,8 +402,8 @@ disambiguation symbols and possibly epsilon symbols). In the graph, as always, these are represented by integer labels. We use an object that, in code and in filenames, is generally called ilabel_info. The ilabel_info object 4has a strong connection to the \ref fst::ContextFst "ContextFst" objects, see \ref graph_context. -As with many other Kaldi types, ilabel_info is a generic (STL) type but -we use a consistent variable name +As with many other Kaldi types, ilabel_info is a generic (STL) type but +we use a consistent variable name to make it identifiable. It is of the following type: \code std::vector > ilabel_info; @@ -402,7 +413,7 @@ input label the corresponding phonetic context window (see above, \ref tree_window). For example, suppose symbol 1500 is phone 30 with a right-context of 12 and a left-context of 4, we would have -\code +\code // not valid C++ ilabel_info[1500] == { 4, 30, 12 }; \endcode @@ -410,14 +421,14 @@ In the monophone case, we would have things like: \code ilabel_info[30] == { 28 }; \endcode -There are special cases to deal with disambiguation symbols (see -\ref graph_disambig or the +There are special cases to deal with disambiguation symbols (see +\ref graph_disambig or the Springer Handbook paper referenced above for an explanation of what these are). 
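Before describing those special cases, here is a minimal sketch of how a plain ilabel_info entry might be unpacked in a standard triphone setup (N = 3, P = 1). The helper name UnpackTriphoneEntry is hypothetical and is not part of Kaldi; it only illustrates the layout described above.
\code
#include <vector>
#include "base/kaldi-common.h"  // for kaldi::int32 and KALDI_ASSERT.

// Hypothetical helper: unpack one plain (non-special) entry of an ilabel_info
// object for a triphone system (context width N = 3, central position P = 1).
void UnpackTriphoneEntry(const std::vector<std::vector<kaldi::int32> > &ilabel_info,
                         kaldi::int32 ilabel,
                         kaldi::int32 *left, kaldi::int32 *central, kaldi::int32 *right) {
  const std::vector<kaldi::int32> &window = ilabel_info[ilabel];
  KALDI_ASSERT(window.size() == 3);  // plain entry only; special cases are described below.
  *left = window[0];     // left-context phone.
  *central = window[1];  // the central phone that this input label stands for.
  *right = window[2];    // right-context phone.
}
\endcode
For the example above, calling this hypothetical helper with ilabel 1500 would give left = 4, central = 30 and right = 12.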
If an ilabel_info entry corresponds to a disambiguation symbol, we put in it the negative of the symbol-table entry of the disambiguation symbol (note that this is not the same as the number of the printed form -of the disambiguation symbol as in #0, #1, #2 etc., it is the number -corresponding to it in a symbol-table file, which in our current scripts is +of the disambiguation symbol as in #0, #1, #2 etc., it is the number +corresponding to it in a symbol-table file, which in our current scripts is called phones_disambig.txt). For example, \code ilabel_info[5] == { -42 }; @@ -428,7 +439,7 @@ so the programs that interpret the ilabel_info object don't need to be given a list of disambiguation symbols in order to be able to distinguish them from real phones in the monophone case. There are two additional special cases: we have -\code +\code ilabel_info[0] == { }; // epsilon ilabel_info[1] == { 0 }; // disambig symbol #-1; // we use symbol 1, but don't consider this hardwired. diff --git a/src/doc/tutorial.dox b/src/doc/tutorial.dox index ea94ee93e50..2e47624abeb 100644 --- a/src/doc/tutorial.dox +++ b/src/doc/tutorial.dox @@ -23,7 +23,7 @@ - \subpage tutorial_prereqs "Prerequisites" - \subpage tutorial_setup "Getting started" (15 minutes) - \subpage tutorial_git "Version control with Git" (5 minutes) - - \subpage tutorial_looking "Overview of the distribution" (25 minutes) + - \subpage tutorial_looking "Overview of the distribution" (20 minutes) - \subpage tutorial_running "Running the example scripts" (40 minutes) - \subpage tutorial_code "Reading and modifying the code" (30 minutes) diff --git a/src/doc/tutorial_git.dox b/src/doc/tutorial_git.dox index 63676df86c1..7612a1b1e4a 100644 --- a/src/doc/tutorial_git.dox +++ b/src/doc/tutorial_git.dox @@ -252,6 +252,15 @@ GitHub will automatically update the pull request web page. Then reply e. g. "Done" under the comments that you received, so that they know you followed up on their comments. +If you are creating a pull request only for a review of an incomplete piece of +work, which makes sense and is encouraged if you want early feedback on a +proposed feature, begin the title of your pull request with the prefix +WIP:. This will tell the maintainers not to merge the pull request +yet. When you push more commits to your branch, they automatically show in the +pull request. When you think the work is complete, edit the pull request title +to remove the \c WIP prefix and then add a comment to this effect, so that the +maintainers are notified. + \ref tutorial "Up: Kaldi tutorial"
\ref tutorial_setup "Previous: Getting started"
\ref tutorial_looking "Next: Overview of the distribution"
diff --git a/src/doc/tutorial_looking.dox b/src/doc/tutorial_looking.dox index 6d525df93e9..420abfc9bce 100644 --- a/src/doc/tutorial_looking.dox +++ b/src/doc/tutorial_looking.dox @@ -35,11 +35,8 @@ The directory "tools/' is where we install things that Kaldi depends on in various ways. Change directory to tools/ and list it. You will see various - files and subdirectories, mostly things that have been installed by the script - install.sh. Look very quickly at the files install.sh and INSTALL. These files - contain similar material since they cover the same steps, but INSTALL is the - manual version of the instructions and install.sh is the automatic version. The - manual version may be helpful as a fall-back plan in case you have installation problems. + files and subdirectories, mostly things that have been installed by the make command. + Look very quickly at the file INSTALL. This file gives instructions on how to install the tools. The most important subdirectory is the one for OpenFst. cd to openfst/. This is a soft link to the actual directory which has a version number. List the openfst directory. @@ -142,16 +139,16 @@ include ../kaldi.mk Look at the file ../kaldi.mk. It will contain some rules related to valgrind (for memory debugging), and then some system-specific configuration in the form of variables such as CXXFLAGS. - See if there are any -O options (e.g. -O0). You might want to remove the flags - -O0 and -DKALDI_PARANOID before running big experiments, as they slow things - down (we enable them by default for better debugging). + See if there are any -O options (e.g. -O0). The flags + -O0 and -DKALDI_PARANOID are disabled by default as they slow things + down (you might want to enable them for better debugging). Look again at base/Makefile. The statement "all:" at the top tells Make that "all" is the top-level target (because there are targets in kaldi.mk and we don't want these to become the top-level target). Because the dependencies of "all" depend on variables defined later, we have another - statement down below in which we define what "all" depends on. Look for - it. Several other targets are defined, starting with "clean". Look for - them in the Makefile. To make "clean" you would type "make clean". + statement (the target is defined in default_rules.mk) in which we define what "all" depends on. + Look for it. Several other targets are defined, starting with "clean". + Look for them. To make "clean" you would type "make clean". The target .valgrind is not something you would invoke from the command line; you would type "make valgrind" (the target is defined in kaldi.mk). Invoke all of these targets, i.e. type "make clean" and the same for the others, diff --git a/src/doc/tutorial_running.dox b/src/doc/tutorial_running.dox index a9f782b9fc2..1f3cb4ee82a 100644 --- a/src/doc/tutorial_running.dox +++ b/src/doc/tutorial_running.dox @@ -148,7 +148,7 @@ Look at the files with suffix .csl (in data/lang/phones). These are colon-separa Look at phones.txt (in data/lang/). This file is a phone symbol table that also handles the "disambiguation symbols" used in the standard FST recipe. These symbols are conventionally called \#1, \#2 and so on; - see the paper "Speech Recognition + see the paper "Speech Recognition with Weighted Finite State Transducers" . We also add a symbol \#0 which replaces epsilon transitions in the language model; see \ref graph_disambig for more information. 
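For a concrete picture, the end of a typical phones.txt looks something like the following; the symbols and integer ids shown here are only illustrative, and the exact values will differ in your setup (the disambiguation symbols are listed after the regular phones):
\verbatim
...
zh 41
#0 42
#1 43
#2 44
\endverbatim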
How many disambiguation symbols diff --git a/src/feat/Makefile b/src/feat/Makefile index 8b8fa5145ad..858ed714be3 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -6,16 +6,18 @@ include ../kaldi.mk TESTFILES = feature-mfcc-test feature-plp-test feature-fbank-test \ feature-functions-test pitch-functions-test feature-sdc-test \ - resample-test online-feature-test sinusoid-detection-test + resample-test online-feature-test sinusoid-detection-test \ + signal-test OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ feature-spectrogram.o mel-computations.o wave-reader.o \ - pitch-functions.o resample.o online-feature.o sinusoid-detection.o + pitch-functions.o resample.o online-feature.o sinusoid-detection.o \ + signal.o feature-window.o LIBNAME = kaldi-feat ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a ../thread/kaldi-thread.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/feat/feature-common-inl.h b/src/feat/feature-common-inl.h new file mode 100644 index 00000000000..a9c3c47ebbc --- /dev/null +++ b/src/feat/feature-common-inl.h @@ -0,0 +1,74 @@ +// feat/feature-common-inl.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_ +#define KALDI_FEAT_FEATURE_COMMON_INL_H_ + +// Do not include this file directly. It is included by feat/feature-common.h + +namespace kaldi { + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder) { + KALDI_ASSERT(output != NULL); + int32 rows_out = NumFrames(wave.Dim(), computer_.GetFrameOptions()), + cols_out = computer_.Dim(); + if (rows_out == 0) { + output->Resize(0, 0); + if (deprecated_wave_remainder != NULL) + *deprecated_wave_remainder = wave; + return; + } + output->Resize(rows_out, cols_out); + if (deprecated_wave_remainder != NULL) + ExtractWaveformRemainder(wave, computer_.GetFrameOptions(), + deprecated_wave_remainder); + Vector window; // windowed waveform. + bool use_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 r = 0; r < rows_out; r++) { // r is frame index. + BaseFloat raw_log_energy = 0.0; + ExtractWindow(0, wave, r, computer_.GetFrameOptions(), + feature_window_function_, &window, + (use_raw_log_energy ? 
&raw_log_energy : NULL)); + + SubVector output_row(*output, r); + computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row); + } +} + +template +void OfflineFeatureTpl::Compute( + const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder) const { + OfflineFeatureTpl temp(*this); + // call the non-const version of Compute() on a temporary copy of this object. + // This is a workaround for const-ness that may sometimes be useful in + // multi-threaded code, although it's not optimally efficient. + temp.Compute(wave, vtln_warp, output, deprecated_wave_remainder); +} + +} // end namespace kaldi + +#endif diff --git a/src/feat/feature-common.h b/src/feat/feature-common.h new file mode 100644 index 00000000000..70d8f4b043e --- /dev/null +++ b/src/feat/feature-common.h @@ -0,0 +1,161 @@ +// feat/feature-common.h + +// Copyright 2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABILITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_COMMON_H_ +#define KALDI_FEAT_FEATURE_COMMON_H_ + +#include +#include +#include "feat/feature-window.h" + +namespace kaldi { +/// @addtogroup feat FeatureCommon +/// @{ + + + +/// This class is only added for documentation, it is not intended to ever be +/// used. +struct ExampleFeatureComputerOptions { + FrameExtractionOptions frame_opts; + // .. more would go here. +}; + +/// This class is only added for documentation, it is not intended to ever be +/// used. It documents the interface of the *Computer classes which wrap the +/// low-level feature extraction. The template argument F of OfflineFeatureTpl must +/// follow this interface. This interface is intended for features such as +/// MFCCs and PLPs which can be computed frame by frame. +class ExampleFeatureComputer { + public: + typedef ExampleFeatureComputerOptions Options; + + /// Returns a reference to the frame-extraction options class, which + /// will be part of our own options class. + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /// Returns the feature dimension + int32 Dim(); + + /// Returns true if this function may inspect the raw log-energy of the signal + /// (before windowing and pre-emphasis); it's safe to always return true, but + /// setting it to false enables an optimization. + bool NeedRawLogEnergy() { return true; } + + /// constructor from options class; it should not store a reference or pointer + /// to the options class but should copy it. + explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts): + opts_(opts) { } + + /// Copy constructor; all of these classes must have one. + ExampleFeatureComputer(const ExampleFeatureComputer &other); + + /** + Function that computes one frame of features from + one frame of signal. 
+ + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VTLN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + private: + // disallow assignment. + ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in); + Options opts_; +}; + + +/// This templated class is intended for offline feature extraction, i.e. where +/// you have access to the entire signal at the start. It exists mainly to be +/// a drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for +/// use in the offline case. In April 2016 we reorganized the online +/// feature-computation code for greater modularity and to have correct support +/// for the snip-edges=false option. +template +class OfflineFeatureTpl { + public: + typedef typename F::Options Options; + + // Note: feature_window_function_ is the windowing function, which is initialized + // using the options class, that we cache at this level. + OfflineFeatureTpl(const Options &opts): + computer_(opts), + feature_window_function_(computer_.GetFrameOptions()) { } + + // Computes the features for one file (one sequence of features). + // Use of the 'deprecated_wave_remainder' argument is highly deprecated; it is + // only provided for back-compatibility for code that may have + // relied on the older interface. It's deprecated because it + // doesn't support the --snip-edges=false option, and because + // we plan to eventually remove this argument so that there + // will be only one way to do online feature extraction. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder = NULL); + + // This const version of Compute() is a wrapper that + // calls the non-const version on a temporary object. + // It's less efficient than the non-const version. + void Compute(const VectorBase &wave, + BaseFloat vtln_warp, + Matrix *output, + Vector *deprecated_wave_remainder = NULL) const; + + int32 Dim() const { return computer_.Dim(); } + + // Copy constructor. + OfflineFeatureTpl(const OfflineFeatureTpl &other): + computer_(other.computer_), + feature_window_function_(other.feature_window_function_) { } + private: + // Disallow assignment.
+ OfflineFeatureTpl &operator =(const OfflineFeatureTpl &other); + + F computer_; + FeatureWindowFunction feature_window_function_; +}; + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#include "feat/feature-common-inl.h" + +#endif // KALDI_FEAT_FEATURE_COMMON_H_ diff --git a/src/feat/feature-fbank.cc b/src/feat/feature-fbank.cc index bac61ed2059..6b8d49e9403 100644 --- a/src/feat/feature-fbank.cc +++ b/src/feat/feature-fbank.cc @@ -1,6 +1,7 @@ // feat/feature-fbank.cc // Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -20,11 +21,10 @@ #include "feat/feature-fbank.h" - namespace kaldi { -Fbank::Fbank(const FbankOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +FbankComputer::FbankComputer(const FbankOptions &opts): + opts_(opts), srfft_(NULL) { if (opts.energy_floor > 0.0) log_energy_floor_ = Log(opts.energy_floor); @@ -33,21 +33,29 @@ Fbank::Fbank(const FbankOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Fbank::~Fbank() { +FbankComputer::FbankComputer(const FbankComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), srfft_(NULL) { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + +FbankComputer::~FbankComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) delete iter->second; - if (srfft_ != NULL) - delete srfft_; + delete srfft_; } -const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -61,124 +69,52 @@ const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -const MelBanks *Fbank::GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks = iter->second; - *must_delete = false; - } - return this_mel_banks; -} +void FbankComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { -void Fbank::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); - ComputeInternal(wave, *this_mel_banks, output, wave_remainder); -} + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); -void Fbank::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - - ComputeInternal(wave, *mel_banks, output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; -} + 
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); -void Fbank::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); + // Compute energy after window function (not the raw one). + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); - // Get dimensions of output features - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.mel_opts.num_bins + opts_.use_energy; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - // Prepare the output buffer - output->Resize(rows_out, cols_out); - - // Optionally extract the remainder for further processing - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - - // Buffers - Vector window; // windowed waveform. - Vector mel_energies; - std::vector temp_buffer; // used by srfft. - BaseFloat log_energy; - - // Compute all the freames, r is frame index.. - for (int32 r = 0; r < rows_out; r++) { - // Cut the window, apply window function - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - // Compute energy after window function (not the raw one) - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - // Sum with MelFiterbank over power spectrum - mel_banks.Compute(power_spectrum, &mel_energies); - if (opts_.use_log_fbank) { - // avoid log of zero (which should be prevented anyway by dithering). - mel_energies.ApplyFloor(std::numeric_limits::min()); - mel_energies.ApplyLog(); // take the log. - } + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); - // Output buffers - SubVector this_output(output->Row(r)); - SubVector this_fbank(this_output.Range((opts_.use_energy? 1 : 0), - opts_.mel_opts.num_bins)); - - // Copy to output - this_fbank.CopyFromVec(mel_energies); - // Copy energy as first value - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) { - log_energy = log_energy_floor_; - } - this_output(0) = log_energy; - } + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + SubVector mel_energies(*feature, + mel_offset, + opts_.mel_opts.num_bins); + + // Sum with mel fiterbanks over the power spectrum + mel_banks.Compute(power_spectrum, &mel_energies); + if (opts_.use_log_fbank) { + // Avoid log of zero (which should be prevented anyway by dithering). + mel_energies.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies.ApplyLog(); // take the log. 
+ } - // HTK compat: Shift features, so energy is last value - if (opts_.htk_compat && opts_.use_energy) { - BaseFloat energy = this_output(0); - for (int32 i = 0; i < opts_.mel_opts.num_bins; i++) { - this_output(i) = this_output(i+1); - } - this_output(opts_.mel_opts.num_bins) = energy; + // Copy energy as first value (or the last, if htk_compat == true). + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) { + signal_log_energy = log_energy_floor_; } + int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0; + (*feature)(energy_index) = signal_log_energy; } } diff --git a/src/feat/feature-fbank.h b/src/feat/feature-fbank.h index 966f305ea6c..b93ed6f58cf 100644 --- a/src/feat/feature-fbank.h +++ b/src/feat/feature-fbank.h @@ -1,6 +1,7 @@ // feat/feature-fbank.h // Copyright 2009-2012 Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,14 +24,17 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// FbankOptions contains basic options for computing FBANK features +/// FbankOptions contains basic options for computing filterbank features. /// It only includes things that can be done in a "stateless" way, i.e. /// it does not include energy max-normalization. /// It does not include delta computation. @@ -42,7 +46,7 @@ struct FbankOptions { bool raw_energy; // If true, compute energy before preemphasis and windowing bool htk_compat; // If true, put energy last (if using energy) bool use_log_fbank; // if true (default), produce log-filterbank, else linear - + FbankOptions(): mel_opts(23), // defaults the #mel-banks to 23 for the FBANK computations. // this seems to be common for 16khz-sampled data, @@ -70,54 +74,67 @@ struct FbankOptions { } }; -class MelBanks; - /// Class for computing mel-filterbank features; see \ref feat_mfcc for more /// information. -class Fbank { +class FbankComputer { public: - explicit Fbank(const FbankOptions &opts); - ~Fbank(); - - int32 Dim() const { return opts_.mel_opts.num_bins; } - - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); - - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; typedef FbankOptions Options; + + explicit FbankComputer(const FbankOptions &opts); + FbankComputer(const FbankComputer &other); + + int32 Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + /** + Function that computes one frame of features from + one frame of signal. 
+ + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~FbankComputer(); + private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder = NULL) const; - const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const; FbankOptions opts_; BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Fbank); + // Disallow assignment. + FbankComputer &operator =(const FbankComputer &other); }; +typedef OfflineFeatureTpl Fbank; /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index 9678e909a5a..b8a7b3178f2 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -26,162 +26,6 @@ namespace kaldi { -int32 NumFrames(int32 nsamp, - const FrameExtractionOptions &opts) { - int32 frame_shift = opts.WindowShift(); - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(frame_shift != 0 && frame_length != 0); - if (opts.snip_edges) { - if (static_cast(nsamp) < frame_length) - return 0; - else - return (1 + ((nsamp - frame_length) / frame_shift)); - // view the expression above as: nsamp-frame_length is how much room we - // have to shift the frame within the waveform; frame_shift is how much - // we shift it each time and the ratio is how many times we can shift - // it (integer arithmetic rounds down). 
- } else { - return (int32)(nsamp * 1.0f / frame_shift + 0.5f); - // if --snip-edges=false, the number of frames would be determined by - // rounding the (file-length / frame-shift) to the nearest integer - } -} - - -void Dither(VectorBase *waveform, BaseFloat dither_value) { - for (int32 i = 0; i < waveform->Dim(); i++) - (*waveform)(i) += RandGauss() * dither_value; -} - - -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff) { - if (preemph_coeff == 0.0) return; - KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); - for (int32 i = waveform->Dim()-1; i > 0; i--) - (*waveform)(i) -= preemph_coeff * (*waveform)(i-1); - (*waveform)(0) -= preemph_coeff * (*waveform)(0); -} - - - -FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) { - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(frame_length > 0); - window.Resize(frame_length); - for (int32 i = 0; i < frame_length; i++) { - BaseFloat i_fl = static_cast(i); - if (opts.window_type == "hanning") { - window(i) = 0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)); - } else if (opts.window_type == "hamming") { - window(i) = 0.54 - 0.46*cos(M_2PI * i_fl / (frame_length-1)); - } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges. - window(i) = pow(0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)), 0.85); - } else if (opts.window_type == "rectangular") { - window(i) = 1.0; - } else { - KALDI_ERR << "Invalid window type " << opts.window_type; - } - } -} - -// ExtractWindow extracts a windowed frame of waveform with a power-of-two, -// padded size. It does mean subtraction, pre-emphasis and dithering as -// requested. - -void ExtractWindow(const VectorBase &wave, - int32 f, // with 0 <= f < NumFrames(feats, opts) - const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window) { - int32 frame_shift = opts.WindowShift(); - int32 frame_length = opts.WindowSize(); - KALDI_ASSERT(window_function.window.Dim() == frame_length); - KALDI_ASSERT(frame_shift != 0 && frame_length != 0); - - Vector wave_part(frame_length); - if (opts.snip_edges) { - int32 start = frame_shift*f, end = start + frame_length; - KALDI_ASSERT(start >= 0 && end <= wave.Dim()); - wave_part.CopyFromVec(wave.Range(start, frame_length)); - } else { - // If opts.snip_edges = false, we allow the frames to go slightly over the - // edges of the file; we'll extend the data by reflection. - int32 mid = frame_shift * (f + 0.5), - begin = mid - frame_length / 2, - end = begin + frame_length, - begin_limited = std::max(0, begin), - end_limited = std::min(end, wave.Dim()), - length_limited = end_limited - begin_limited; - - // Copy the main part. Usually this will be the entire window. - wave_part.Range(begin_limited - begin, length_limited). - CopyFromVec(wave.Range(begin_limited, length_limited)); - - // Deal with any end effects by reflection, if needed. This code will - // rarely be reached, so we don't concern ourselves with efficiency. - for (int32 f = begin; f < 0; f++) { - int32 reflected_f = -f; - // The next statement will only have an effect in the case of files - // shorter than a single frame, it's to avoid a crash in those cases. 
- reflected_f = reflected_f % wave.Dim(); - wave_part(f - begin) = wave(reflected_f); - } - for (int32 f = wave.Dim(); f < end; f++) { - int32 distance_to_end = f - wave.Dim(); - // The next statement will only have an effect in the case of files - // shorter than a single frame, it's to avoid a crash in those cases. - distance_to_end = distance_to_end % wave.Dim(); - int32 reflected_f = wave.Dim() - 1 - distance_to_end; - wave_part(f - begin) = wave(reflected_f); - } - } - KALDI_ASSERT(window != NULL); - int32 frame_length_padded = opts.PaddedWindowSize(); - - if (window->Dim() != frame_length_padded) - window->Resize(frame_length_padded); - - SubVector window_part(*window, 0, frame_length); - window_part.CopyFromVec(wave_part); - - if (opts.dither != 0.0) Dither(&window_part, opts.dither); - - if (opts.remove_dc_offset) - window_part.Add(-window_part.Sum() / frame_length); - - if (log_energy_pre_window != NULL) { - BaseFloat energy = std::max(VecVec(window_part, window_part), - std::numeric_limits::min()); - *log_energy_pre_window = Log(energy); - } - - if (opts.preemph_coeff != 0.0) - Preemphasize(&window_part, opts.preemph_coeff); - - window_part.MulElements(window_function.window); - - if (frame_length != frame_length_padded) - SubVector(*window, frame_length, - frame_length_padded-frame_length).SetZero(); -} - -void ExtractWaveformRemainder(const VectorBase &wave, - const FrameExtractionOptions &opts, - Vector *wave_remainder) { - int32 frame_shift = opts.WindowShift(); - int32 num_frames = NumFrames(wave.Dim(), opts); - // offset is the amount at the start that has been extracted. - int32 offset = num_frames * frame_shift; - KALDI_ASSERT(wave_remainder != NULL); - int32 remaining_len = wave.Dim() - offset; - wave_remainder->Resize(remaining_len); - KALDI_ASSERT(remaining_len >= 0); - if (remaining_len > 0) - wave_remainder->CopyFromVec(SubVector(wave, offset, remaining_len)); -} - - void ComputePowerSpectrum(VectorBase *waveform) { int32 dim = waveform->Dim(); @@ -341,22 +185,6 @@ void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts, } - - - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans) { - int32 n = mel_banks.NumBins(); - // central freq of each mel bin - const Vector &f0 = mel_banks.GetCenterFreqs(); - ans->Resize(n); - for (int32 i = 0; i < n; i++) { - BaseFloat fsq = f0(i) * f0(i); - BaseFloat fsub = fsq / (fsq + 1.6e5); - (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); - } -} - void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out) { BaseFloat angle = M_PI / static_cast(dimension - 1); BaseFloat scale = 1.0f / (2.0 * static_cast(dimension - 1)); @@ -374,20 +202,6 @@ void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out) { } } -// Compute LP coefficients from autocorrelation coefficients. 
-BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out) { - int32 n = autocorr_in.Dim() - 1; - KALDI_ASSERT(lpc_out->Dim() == n); - Vector tmp(n); - BaseFloat ans = Durbin(n, autocorr_in.Data(), - lpc_out->Data(), - tmp.Data()); - if (ans <= 0.0) - KALDI_WARN << "Zero energy in LPC computation"; - return -Log((double)1.0/ans); // forms the C0 value -} - void SpliceFrames(const MatrixBase &input_features, int32 left_context, int32 right_context, diff --git a/src/feat/feature-functions.h b/src/feat/feature-functions.h index c5dfe9a3010..42a9703757f 100644 --- a/src/feat/feature-functions.h +++ b/src/feat/feature-functions.h @@ -2,6 +2,7 @@ // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation // 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -28,141 +29,12 @@ #include "matrix/matrix-lib.h" #include "util/common-utils.h" #include "base/kaldi-error.h" -#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -struct MelBanksOptions { - int32 num_bins; // e.g. 25; number of triangular bins - BaseFloat low_freq; // e.g. 20; lower frequency cutoff - BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative - // ->added to the Nyquist frequency to get the cutoff. - BaseFloat vtln_low; // vtln lower cutoff of warping function. - BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added - // to the Nyquist frequency to get the cutoff. - bool debug_mel; - // htk_mode is a "hidden" config, it does not show up on command line. - // Enables more exact compatibibility with HTK, for testing purposes. Affects - // mel-energy flooring and reproduces a bug in HTK. - bool htk_mode; - explicit MelBanksOptions(int num_bins = 25) - : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), - vtln_high(-500), debug_mel(false), htk_mode(false) {} - - void Register(OptionsItf *opts) { - opts->Register("num-mel-bins", &num_bins, - "Number of triangular mel-frequency bins"); - opts->Register("low-freq", &low_freq, - "Low cutoff frequency for mel bins"); - opts->Register("high-freq", &high_freq, - "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); - opts->Register("vtln-low", &vtln_low, - "Low inflection point in piecewise linear VTLN warping function"); - opts->Register("vtln-high", &vtln_high, - "High inflection point in piecewise linear VTLN warping function" - " (if negative, offset from high-mel-freq"); - opts->Register("debug-mel", &debug_mel, - "Print out debugging information for mel bin computation"); - } -}; - - -struct FrameExtractionOptions { - BaseFloat samp_freq; - BaseFloat frame_shift_ms; // in milliseconds. - BaseFloat frame_length_ms; // in milliseconds. - BaseFloat dither; // Amount of dithering, 0.0 means no dither. - BaseFloat preemph_coeff; // Preemphasis coefficient. - bool remove_dc_offset; // Subtract mean of wave before FFT. - std::string window_type; // e.g. Hamming window - bool round_to_power_of_two; - bool snip_edges; - // Maybe "hamming", "rectangular", "povey", "hanning" - // "povey" is a window I made to be similar to Hamming but to go to zero at the - // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) - // I just don't think the Hamming window makes sense as a windowing function. 
- FrameExtractionOptions(): - samp_freq(16000), - frame_shift_ms(10.0), - frame_length_ms(25.0), - dither(1.0), - preemph_coeff(0.97), - remove_dc_offset(true), - window_type("povey"), - round_to_power_of_two(true), - snip_edges(true){ } - - void Register(OptionsItf *opts) { - opts->Register("sample-frequency", &samp_freq, - "Waveform data sample frequency (must match the waveform file, " - "if specified there)"); - opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); - opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); - opts->Register("preemphasis-coefficient", &preemph_coeff, - "Coefficient for use in signal preemphasis"); - opts->Register("remove-dc-offset", &remove_dc_offset, - "Subtract mean from waveform on each frame"); - opts->Register("dither", &dither, "Dithering constant (0.0 means no dither)"); - opts->Register("window-type", &window_type, "Type of window " - "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\")"); - opts->Register("round-to-power-of-two", &round_to_power_of_two, - "If true, round window size to power of two."); - opts->Register("snip-edges", &snip_edges, - "If true, end effects will be handled by outputting only frames that " - "completely fit in the file, and the number of frames depends on the " - "frame-length. If false, the number of frames depends only on the " - "frame-shift, and we reflect the data at the ends."); - } - int32 WindowShift() const { - return static_cast(samp_freq * 0.001 * frame_shift_ms); - } - int32 WindowSize() const { - return static_cast(samp_freq * 0.001 * frame_length_ms); - } - int32 PaddedWindowSize() const { - return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) : - WindowSize()); - } -}; - - -struct FeatureWindowFunction { - FeatureWindowFunction() {} - explicit FeatureWindowFunction(const FrameExtractionOptions &opts); - Vector window; -}; - -int32 NumFrames(int32 wave_length, - const FrameExtractionOptions &opts); - -void Dither(VectorBase *waveform, BaseFloat dither_value); - -void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); - - -// ExtractWindow extracts a windowed frame of waveform with a power-of-two, -// padded size. If log_energy_pre_window != NULL, outputs the log of the -// sum-of-squared samples before preemphasis and windowing -void ExtractWindow(const VectorBase &wave, - int32 f, // with 0 <= f < NumFrames(wave.Dim(), opts) - const FrameExtractionOptions &opts, - const FeatureWindowFunction &window_function, - Vector *window, - BaseFloat *log_energy_pre_window = NULL); - -// ExtractWaveformRemainder is useful if the waveform is coming in segments. -// It extracts the bit of the waveform at the end of this block that you -// would have to append the next bit of waveform to, if you wanted to have -// the same effect as everything being in one big block. -void ExtractWaveformRemainder(const VectorBase &wave, - const FrameExtractionOptions &opts, - Vector *wave_remainder); - - - // ComputePowerSpectrum converts a complex FFT (as produced by the FFT // functions in matrix/matrix-functions.h), and converts it into // a power spectrum. If the complex FFT is a vector of size n (representing @@ -173,22 +45,6 @@ void ExtractWaveformRemainder(const VectorBase &wave, void ComputePowerSpectrum(VectorBase *complex_fft); - -inline void MaxNormalizeEnergy(Matrix *feats) { - // Just subtract the largest energy value... assume energy is the first - // column of the mfcc features. 
Don't do the flooring of energy (dithering - // should prevent exact zeros). - // We didn't put this in the main MFCC computation as we wanted to make sure - // it is stateless (so we can do it bit by bit for large waveforms). - // not compatible with the order_as_htk_ option in MfccOptions. - SubMatrix energy(*feats, 0, feats->NumRows(), 0, 1); - energy.Add(-energy.Max()); -} - - - - - struct DeltaFeaturesOptions { int32 order; int32 window; // e.g. 2; controls window size (window size is 2*window + 1) @@ -293,19 +149,10 @@ void SpliceFrames(const MatrixBase &input_features, void ReverseFrames(const MatrixBase &input_features, Matrix *output_features); -class MelBanks; - -void GetEqualLoudnessVector(const MelBanks &mel_banks, - Vector *ans); - void InitIdftBases(int32 n_bases, int32 dimension, Matrix *mat_out); -// Compute LP coefficients from autocorrelation coefficients. -BaseFloat ComputeLpc(const VectorBase &autocorr_in, - Vector *lpc_out); - // This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which // is online CMN with no latency, for online speech recognition. struct SlidingWindowCmnOptions { diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index 135152733d9..c1962a5c1d1 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -1,6 +1,7 @@ // feat/feature-mfcc.cc // Copyright 2009-2011 Karel Vesely; Petr Motlicek +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,8 +24,64 @@ namespace kaldi { -Mfcc::Mfcc(const MfccOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { + +void MfccComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); + + if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, 0, + signal_frame->Dim() / 2 + 1); + + mel_banks.Compute(power_spectrum, &mel_energies_); + + // avoid log of zero (which should be prevented anyway by dithering). + mel_energies_.ApplyFloor(std::numeric_limits::epsilon()); + mel_energies_.ApplyLog(); // take the log. + + feature->SetZero(); // in case there were NaNs. + // feature = dct_matrix_ * mel_energies [which now have log] + feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0); + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + (*feature)(0) = signal_log_energy; + } + + if (opts_.htk_compat) { + BaseFloat energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps - 1; i++) + (*feature)(i) = (*feature)(i+1); + if (!opts_.use_energy) + energy *= M_SQRT2; // scale on C0 (actually removing a scale + // we previously added that's part of one common definition of + // the cosine transform.) 
+ (*feature)(opts_.num_ceps - 1) = energy; + } +} + +MfccComputer::MfccComputer(const MfccOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_(opts.mel_opts.num_bins) { int32 num_bins = opts.mel_opts.num_bins; Matrix dct_matrix(num_bins, num_bins); ComputeDctMatrix(&dct_matrix); @@ -44,23 +101,37 @@ Mfcc::Mfcc(const MfccOptions &opts) int32 padded_window_size = opts.frame_opts.PaddedWindowSize(); if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two... srfft_ = new SplitRadixRealFft(padded_window_size); - + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Mfcc::~Mfcc() { +MfccComputer::MfccComputer(const MfccComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + dct_matrix_(other.dct_matrix_), + log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), + srfft_(NULL), + mel_energies_(other.mel_energies_.Dim(), kUndefined) { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} + + + +MfccComputer::~MfccComputer() { for (std::map::iterator iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) delete iter->second; - if (srfft_ != NULL) - delete srfft_; + delete srfft_; } -const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -75,117 +146,5 @@ const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp) { } -const MelBanks *Mfcc::GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks = iter->second; - *must_delete = false; - } - return this_mel_banks; -} - - -void Mfcc::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); - ComputeInternal(wave, *this_mel_banks, output, wave_remainder); -} - -void Mfcc::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - - ComputeInternal(wave, *mel_banks, output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; -} - -void Mfcc::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts), - cols_out = opts_.num_ceps; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - output->Resize(rows_out, cols_out); - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - Vector window; // windowed waveform. - Vector mel_energies; - std::vector temp_buffer; // used by srfft. - for (int32 r = 0; r < rows_out; r++) { // r is frame index.. 
- BaseFloat log_energy; - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using the split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - mel_banks.Compute(power_spectrum, &mel_energies); - - // avoid log of zero (which should be prevented anyway by dithering). - mel_energies.ApplyFloor(std::numeric_limits::min()); - mel_energies.ApplyLog(); // take the log. - - SubVector this_mfcc(output->Row(r)); - - // this_mfcc = dct_matrix_ * mel_energies [which now have log] - this_mfcc.AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies, 0.0); - - if (opts_.cepstral_lifter != 0.0) - this_mfcc.MulElements(lifter_coeffs_); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) - log_energy = log_energy_floor_; - this_mfcc(0) = log_energy; - } - - if (opts_.htk_compat) { - BaseFloat energy = this_mfcc(0); - for (int32 i = 0; i < opts_.num_ceps-1; i++) - this_mfcc(i) = this_mfcc(i+1); - if (!opts_.use_energy) - energy *= M_SQRT2; // scale on C0 (actually removing scale - // we previously added that's part of one common definition of - // cosine transform.) - this_mfcc(opts_.num_ceps-1) = energy; - } - } -} - - - - - } // namespace kaldi diff --git a/src/feat/feature-mfcc.h b/src/feat/feature-mfcc.h index 1f814333390..d1d2b8f9d09 100644 --- a/src/feat/feature-mfcc.h +++ b/src/feat/feature-mfcc.h @@ -1,6 +1,7 @@ // feat/feature-mfcc.h // Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -23,17 +24,17 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// MfccOptions contains basic options for computing MFCC features -/// It only includes things that can be done in a "stateless" way, i.e. -/// it does not include energy max-normalization. -/// It does not include delta computation. +/// MfccOptions contains basic options for computing MFCC features. struct MfccOptions { FrameExtractionOptions frame_opts; MelBanksOptions mel_opts; @@ -77,56 +78,70 @@ struct MfccOptions { } }; -class MelBanks; -/// Class for computing MFCC features; see \ref feat_mfcc for more information. -class Mfcc { +// This is the new-style interface to the MFCC computation. +class MfccComputer { public: - explicit Mfcc(const MfccOptions &opts); - ~Mfcc(); + typedef MfccOptions Options; + explicit MfccComputer(const MfccOptions &opts); + MfccComputer(const MfccComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } int32 Dim() const { return opts_.num_ceps; } - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. 
It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. - void Compute(const VectorBase &wave, + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); + VectorBase *signal_frame, + VectorBase *feature); - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; - - typedef MfccOptions Options; + ~MfccComputer(); private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - Matrix *output, - Vector *wave_remainder = NULL) const; - + // disallow assignment. + MfccComputer &operator = (const MfccComputer &in); + const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, - bool *must_delete) const; - MfccOptions opts_; Vector lifter_coeffs_; Matrix dct_matrix_; // matrix we left-multiply by to perform DCT. BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Mfcc); + + // note: mel_energies_ is specific to the frame we're processing, it's + // just a temporary workspace. 
+ Vector mel_energies_; }; +typedef OfflineFeatureTpl Mfcc; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-plp.cc b/src/feat/feature-plp.cc index fe439864346..0034027cbe6 100644 --- a/src/feat/feature-plp.cc +++ b/src/feat/feature-plp.cc @@ -1,6 +1,7 @@ // feat/feature-plp.cc // Copyright 2009-2011 Petr Motlicek; Karel Vesely +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -19,13 +20,16 @@ #include "feat/feature-plp.h" -#include "util/parse-options.h" - namespace kaldi { -Plp::Plp(const PlpOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +PlpComputer::PlpComputer(const PlpOptions &opts): + opts_(opts), srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { + if (opts.cepstral_lifter != 0.0) { lifter_coeffs_.Resize(opts.num_ceps); ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_); @@ -41,28 +45,42 @@ Plp::Plp(const PlpOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); // We'll definitely need the filterbanks info for VTLN warping factor 1.0. - // [note: this call caches it.] The reason we call this here is to - // improve the efficiency of the "const" version of Compute(). + // [note: this call caches it.] GetMelBanks(1.0); } -Plp::~Plp() { +PlpComputer::PlpComputer(const PlpComputer &other): + opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_), + idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_), + mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_), + srfft_(NULL), + mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined), + autocorr_coeffs_(opts_.lpc_order + 1, kUndefined), + lpc_coeffs_(opts_.lpc_order, kUndefined), + raw_cepstrum_(opts_.lpc_order, kUndefined) { for (std::map::iterator iter = mel_banks_.begin(); - iter != mel_banks_.end(); - ++iter) - delete iter->second; + iter != mel_banks_.end(); ++iter) + iter->second = new MelBanks(*(iter->second)); + for (std::map*>::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + iter->second = new Vector(*(iter->second)); + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*(other.srfft_)); +} - for (std::map* >::iterator iter = equal_loudness_.begin(); - iter != equal_loudness_.end(); - ++iter) +PlpComputer::~PlpComputer() { + for (std::map::iterator iter = mel_banks_.begin(); + iter != mel_banks_.end(); ++iter) delete iter->second; - - if (srfft_ != NULL) - delete srfft_; + for (std::map* >::iterator + iter = equal_loudness_.begin(); + iter != equal_loudness_.end(); ++iter) + delete iter->second; + delete srfft_; } -const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp) { +const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) { MelBanks *this_mel_banks = NULL; std::map::iterator iter = mel_banks_.find(vtln_warp); if (iter == mel_banks_.end()) { @@ -76,23 +94,7 @@ const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp) { return this_mel_banks; } -const MelBanks *Plp::GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const { - MelBanks *this_mel_banks = NULL; - std::map::const_iterator iter = - mel_banks_.find(vtln_warp); - if (iter == mel_banks_.end()) { - this_mel_banks = new MelBanks(opts_.mel_opts, - opts_.frame_opts, - vtln_warp); - *must_delete = true; - } else { - this_mel_banks 
= iter->second; - *must_delete = false; - } - return this_mel_banks; -} - -const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp) { +const Vector *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) { const MelBanks *this_mel_banks = GetMelBanks(vtln_warp); Vector *ans = NULL; std::map*>::iterator iter @@ -107,160 +109,81 @@ const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp) { return ans; } +void PlpComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); -const Vector *Plp::GetEqualLoudness(BaseFloat vtln_warp, - const MelBanks &mel_banks, - bool *must_delete) const { - Vector *ans = NULL; - std::map*>::const_iterator iter - = equal_loudness_.find(vtln_warp); - if (iter == equal_loudness_.end()) { - ans = new Vector; - GetEqualLoudnessVector(mel_banks, ans); - *must_delete = true; - } else { - ans = iter->second; - *must_delete = false; - } - return ans; -} + const MelBanks &mel_banks = *GetMelBanks(vtln_warp); + const Vector &equal_loudness = *GetEqualLoudness(vtln_warp); -void Plp::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) { - const MelBanks *mel_banks = GetMelBanks(vtln_warp); - const Vector *equal_loudness = GetEqualLoudness(vtln_warp); - ComputeInternal(wave, *mel_banks, - *equal_loudness, - output, wave_remainder); -} + KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. -void Plp::Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder) const { - bool must_delete_mel_banks, must_delete_equal_loudness; - const MelBanks *mel_banks = GetMelBanks(vtln_warp, - &must_delete_mel_banks); - const Vector *equal_loudness - = GetEqualLoudness(vtln_warp, *mel_banks, - &must_delete_equal_loudness); - - ComputeInternal(wave, *mel_banks, *equal_loudness, - output, wave_remainder); - - if (must_delete_mel_banks) - delete mel_banks; - if (must_delete_equal_loudness) - delete equal_loudness; -} + if (opts_.use_energy && !opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::min())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); // elements 0 ... signal_frame->Dim()/2 + + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); -void Plp::ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - const Vector &equal_loudness, - Matrix *output, - Vector *wave_remainder) const { - KALDI_ASSERT(output != NULL); - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts), - cols_out = opts_.num_ceps; - if (rows_out == 0) { - output->Resize(0, 0); - *wave_remainder = wave; - return; - } - output->Resize(rows_out, cols_out); - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - Vector window; // windowed waveform. 
int32 num_mel_bins = opts_.mel_opts.num_bins; - Vector mel_energies(num_mel_bins); - Vector mel_energies_duplicated(num_mel_bins+2); - Vector autocorr_coeffs(opts_.lpc_order+1); - Vector lpc_coeffs(opts_.lpc_order); - Vector raw_cepstrum(opts_.lpc_order); // not including C0, - // and size may differ from final size. - Vector final_cepstrum(opts_.num_ceps); - std::vector temp_buffer; // used by srfft. - - KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0. - for (int32 r = 0; r < rows_out; r++) { // r is frame index.. - BaseFloat log_energy; - ExtractWindow(wave, r, opts_.frame_opts, - feature_window_function_, &window, - (opts_.use_energy && opts_.raw_energy ? &log_energy : NULL)); - - if (opts_.use_energy && !opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true, &temp_buffer); - else // An alternative algorithm that works for non-powers-of-two. - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); // elements 0 ... window.Dim()/2 - - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - mel_banks.Compute(power_spectrum, &mel_energies); - - mel_energies.MulElements(equal_loudness); - - mel_energies.ApplyPow(opts_.compress_factor); - - // duplicate first and last elements. - { - SubVector v(mel_energies_duplicated, 1, num_mel_bins); - v.CopyFromVec(mel_energies); - } - mel_energies_duplicated(0) = mel_energies(0); - mel_energies_duplicated(num_mel_bins+1) = mel_energies(num_mel_bins-1); - - autocorr_coeffs.AddMatVec(1.0, idft_bases_, kNoTrans, - mel_energies_duplicated, 0.0); - - BaseFloat energy = ComputeLpc(autocorr_coeffs, &lpc_coeffs); - - energy = std::max(energy, - std::numeric_limits::min()); - - Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs.Data(), raw_cepstrum.Data()); - { - SubVector dst(final_cepstrum, 1, opts_.num_ceps-1); - SubVector src(raw_cepstrum, 0, opts_.num_ceps-1); - dst.CopyFromVec(src); - final_cepstrum(0) = energy; - } - - if (opts_.cepstral_lifter != 0.0) - final_cepstrum.MulElements(lifter_coeffs_); - - if (opts_.cepstral_scale != 1.0) - final_cepstrum.Scale(opts_.cepstral_scale); - - if (opts_.use_energy) { - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) - log_energy = log_energy_floor_; - final_cepstrum(0) = log_energy; - } - - if (opts_.htk_compat) { - BaseFloat energy = final_cepstrum(0); - for (int32 i = 0; i < opts_.num_ceps-1; i++) - final_cepstrum(i) = final_cepstrum(i+1); - // if (!opts_.use_energy) - // energy *= M_SQRT2; // scale on C0 (actually removing scale - // we previously added that's part of one common definition of - // cosine transform.) 
- final_cepstrum(opts_.num_ceps-1) = energy; - } - - output->Row(r).CopyFromVec(final_cepstrum); - // std::cout << "FIN" << final_cepstrum; + + SubVector mel_energies(mel_energies_duplicated_, 1, num_mel_bins); + + mel_banks.Compute(power_spectrum, &mel_energies); + + mel_energies.MulElements(equal_loudness); + + mel_energies.ApplyPow(opts_.compress_factor); + + // duplicate first and last elements + mel_energies_duplicated_(0) = mel_energies_duplicated_(1); + mel_energies_duplicated_(num_mel_bins + 1) = + mel_energies_duplicated_(num_mel_bins); + + autocorr_coeffs_.SetZero(); // In case of NaNs or infs + autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans, + mel_energies_duplicated_, 0.0); + + BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_); + + residual_log_energy = std::max(residual_log_energy, + std::numeric_limits::min()); + + Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data()); + feature->Range(1, opts_.num_ceps - 1).CopyFromVec( + raw_cepstrum_.Range(0, opts_.num_ceps - 1)); + (*feature)(0) = residual_log_energy; + + if (opts_.cepstral_lifter != 0.0) + feature->MulElements(lifter_coeffs_); + + if (opts_.cepstral_scale != 1.0) + feature->Scale(opts_.cepstral_scale); + + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + (*feature)(0) = signal_log_energy; + } + + if (opts_.htk_compat) { // reorder the features. + BaseFloat log_energy = (*feature)(0); + for (int32 i = 0; i < opts_.num_ceps-1; i++) + (*feature)(i) = (*feature)(i+1); + (*feature)(opts_.num_ceps-1) = log_energy; } } diff --git a/src/feat/feature-plp.h b/src/feat/feature-plp.h index bbcbecc21c8..d7deab07ec1 100644 --- a/src/feat/feature-plp.h +++ b/src/feat/feature-plp.h @@ -23,9 +23,11 @@ #include #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" #include "itf/options-itf.h" -#include "matrix/kaldi-matrix-inl.h" namespace kaldi { /// @addtogroup feat FeatureExtraction @@ -86,68 +88,84 @@ struct PlpOptions { opts->Register("cepstral-scale", &cepstral_scale, "Scaling constant in PLP computation"); opts->Register("htk-compat", &htk_compat, - "If true, put energy or C0 last and put factor of sqrt(2) on " - "C0. Warning: not sufficient to get HTK compatible features " - "(need to change other parameters)."); + "If true, put energy or C0 last. Warning: not sufficient " + "to get HTK compatible features (need to change other " + "parameters)."); } }; -/// Class for computing PLP features. See \ref feat_plp where -/// documentation will eventually be added. -class Plp { +/// This is the new-style interface to the PLP computation. +class PlpComputer { public: - explicit Plp(const PlpOptions &opts); - ~Plp(); + typedef PlpOptions Options; + explicit PlpComputer(const PlpOptions &opts); + PlpComputer(const PlpComputer &other); + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } int32 Dim() const { return opts_.num_ceps; } - /// Will throw exception on failure (e.g. if file too short for even one - /// frame). The output "wave_remainder" is the last frame or two of the - /// waveform that it would be necessary to include in the next call to Compute - /// for the same utterance. It is not exactly the un-processed part (it may - /// have been partly processed), it's the start of the next window that we - /// have not already processed. 
Will throw exception on failure (e.g. if file - /// too short for even one frame). - void Compute(const VectorBase &wave, + bool NeedRawLogEnergy() { return opts_.use_energy && opts_.raw_energy; } + + /** + Function that computes one frame of features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp The VTLN warping factor that the user wants + to be applied when computing features for this utterance. Will + normally be 1.0, meaning no warping is to be done. The value will + be ignored for feature types that don't support VLTN, such as + spectrogram features. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. + */ + void Compute(BaseFloat signal_log_energy, BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL); + VectorBase *signal_frame, + VectorBase *feature); - typedef PlpOptions Options; - /// Const version of Compute() - void Compute(const VectorBase &wave, - BaseFloat vtln_warp, - Matrix *output, - Vector *wave_remainder = NULL) const; + ~PlpComputer(); private: - void ComputeInternal(const VectorBase &wave, - const MelBanks &mel_banks, - const Vector &equal_loudness, - Matrix *output, - Vector *wave_remainder = NULL) const; const MelBanks *GetMelBanks(BaseFloat vtln_warp); - const MelBanks *GetMelBanks(BaseFloat vtln_warp, bool *must_delete) const; - const Vector *GetEqualLoudness(BaseFloat vtln_warp); - const Vector *GetEqualLoudness(BaseFloat vtln_warp, - const MelBanks &mel_banks, - bool *must_delete) const; - PlpOptions opts_; Vector lifter_coeffs_; Matrix idft_bases_; BaseFloat log_energy_floor_; std::map mel_banks_; // BaseFloat is VTLN coefficient. std::map* > equal_loudness_; - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Plp); + + // temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2 + Vector mel_energies_duplicated_; + // temporary vector used inside Compute; size is opts_.lpc_order + 1 + Vector autocorr_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector lpc_coeffs_; + // temporary vector used inside Compute; size is opts_.lpc_order + Vector raw_cepstrum_; + + // Disallow assignment. 
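  // For reference, a summary of how Compute() (in feature-plp.cc) uses the
  // temporaries above, lightly paraphrased from the implementation:
  // power spectrum -> mel filterbank energies (written into the middle of
  // mel_energies_duplicated_, whose first and last elements are then
  // duplicated) -> equal-loudness weighting -> power compression by
  // opts_.compress_factor -> IDFT to autocorr_coeffs_ -> Durbin recursion
  // into lpc_coeffs_ -> Lpc2Cepstrum into raw_cepstrum_ -> liftering,
  // scaling and the energy / HTK-compatibility adjustments on the output.
  // The equal-loudness weight for a bin with center frequency f is
  // (f^2 / (f^2 + 1.6e5))^2 * (f^2 + 1.44e6) / (f^2 + 9.61e6), as computed
  // by GetEqualLoudnessVector() in mel-computations.cc.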
+ PlpComputer &operator =(const PlpComputer &other); }; +typedef OfflineFeatureTpl Plp; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-spectrogram.cc b/src/feat/feature-spectrogram.cc index df915ad90fe..953f38fc54f 100644 --- a/src/feat/feature-spectrogram.cc +++ b/src/feat/feature-spectrogram.cc @@ -24,8 +24,8 @@ namespace kaldi { -Spectrogram::Spectrogram(const SpectrogramOptions &opts) - : opts_(opts), feature_window_function_(opts.frame_opts), srfft_(NULL) { +SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts) + : opts_(opts), srfft_(NULL) { if (opts.energy_floor > 0.0) log_energy_floor_ = Log(opts.energy_floor); @@ -34,63 +34,49 @@ Spectrogram::Spectrogram(const SpectrogramOptions &opts) srfft_ = new SplitRadixRealFft(padded_window_size); } -Spectrogram::~Spectrogram() { - if (srfft_ != NULL) - delete srfft_; +SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other): + opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) { + if (other.srfft_ != NULL) + srfft_ = new SplitRadixRealFft(*other.srfft_); } -void Spectrogram::Compute(const VectorBase &wave, - Matrix *output, - Vector *wave_remainder) { - KALDI_ASSERT(output != NULL); - - // Get dimensions of output features - int32 rows_out = NumFrames(wave.Dim(), opts_.frame_opts); - int32 cols_out = opts_.frame_opts.PaddedWindowSize()/2 +1; - if (rows_out == 0) - KALDI_ERR << "No frames fit in file (#samples is " << wave.Dim() << ")"; - // Prepare the output buffer - output->Resize(rows_out, cols_out); - - // Optionally extract the remainder for further processing - if (wave_remainder != NULL) - ExtractWaveformRemainder(wave, opts_.frame_opts, wave_remainder); - - // Buffers - Vector window; // windowed waveform. - BaseFloat log_energy; - - // Compute all the freames, r is frame index.. - for (int32 r = 0; r < rows_out; r++) { - // Cut the window, apply window function - ExtractWindow(wave, r, opts_.frame_opts, feature_window_function_, - &window, (opts_.raw_energy ? &log_energy : NULL)); - - // Compute energy after window function (not the raw one) - if (!opts_.raw_energy) - log_energy = Log(std::max(VecVec(window, window), - std::numeric_limits::min())); - - if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - srfft_->Compute(window.Data(), true); - else // An alternative algorithm that works for non-powers-of-two - RealFft(&window, true); - - // Convert the FFT into a power spectrum. - ComputePowerSpectrum(&window); - SubVector power_spectrum(window, 0, window.Dim()/2 + 1); - - power_spectrum.ApplyFloor(std::numeric_limits::min()); - power_spectrum.ApplyLog(); - - // Output buffers - SubVector this_output(output->Row(r)); - this_output.CopyFromVec(power_spectrum); - if (opts_.energy_floor > 0.0 && log_energy < log_energy_floor_) { - log_energy = log_energy_floor_; - } - this_output(0) = log_energy; - } +SpectrogramComputer::~SpectrogramComputer() { + delete srfft_; +} + +void SpectrogramComputer::Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature) { + KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() && + feature->Dim() == this->Dim()); + + + // Compute energy after window function (not the raw one) + if (!opts_.raw_energy) + signal_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), + std::numeric_limits::epsilon())); + + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. 
+ srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two + RealFft(signal_frame, true); + + // Convert the FFT into a power spectrum. + ComputePowerSpectrum(signal_frame); + SubVector power_spectrum(*signal_frame, + 0, signal_frame->Dim() / 2 + 1); + + power_spectrum.ApplyFloor(std::numeric_limits::epsilon()); + power_spectrum.ApplyLog(); + + feature->CopyFromVec(power_spectrum); + + if (opts_.energy_floor > 0.0 && signal_log_energy < log_energy_floor_) + signal_log_energy = log_energy_floor_; + // The zeroth spectrogram component is always set to the signal energy, + // instead of the square of the constant component of the signal. + (*feature)(0) = signal_log_energy; } } // namespace kaldi diff --git a/src/feat/feature-spectrogram.h b/src/feat/feature-spectrogram.h index 500e3f4a588..ec318556f24 100644 --- a/src/feat/feature-spectrogram.h +++ b/src/feat/feature-spectrogram.h @@ -24,17 +24,17 @@ #include +#include "feat/feature-common.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -/// SpectrogramOptions contains basic options for computing SPECTROGRAM features -/// It only includes things that can be done in a "stateless" way, i.e. -/// it does not include energy max-normalization. -/// It does not include delta computation. +/// SpectrogramOptions contains basic options for computing spectrogram +/// features. struct SpectrogramOptions { FrameExtractionOptions frame_opts; BaseFloat energy_floor; @@ -53,26 +53,58 @@ struct SpectrogramOptions { } }; -/// Class for computing SPECTROGRAM features; see \ref feat_mfcc for more information. -class Spectrogram { +/// Class for computing spectrogram features. +class SpectrogramComputer { public: - explicit Spectrogram(const SpectrogramOptions &opts); - ~Spectrogram(); + typedef SpectrogramOptions Options; + explicit SpectrogramComputer(const SpectrogramOptions &opts); + SpectrogramComputer(const SpectrogramComputer &other); - /// Will throw exception on failure (e.g. if file too short for - /// even one frame). - void Compute(const VectorBase &wave, - Matrix *output, - Vector *wave_remainder = NULL); + const FrameExtractionOptions& GetFrameOptions() const { + return opts_.frame_opts; + } + + int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; } + + bool NeedRawLogEnergy() { return opts_.raw_energy; } + + + /** + Function that computes one frame of spectrogram features from + one frame of signal. + + @param [in] signal_raw_log_energy The log-energy of the frame of the signal + prior to windowing and pre-emphasis, or + log(numeric_limits::min()), whichever is greater. Must be + ignored by this function if this class returns false from + this->NeedsRawLogEnergy(). + @param [in] vtln_warp This is ignored by this function, it's only + needed for interface compatibility. + @param [in] signal_frame One frame of the signal, + as extracted using the function ExtractWindow() using the options + returned by this->GetFrameOptions(). The function will use the + vector as a workspace, which is why it's a non-const pointer. + @param [out] feature Pointer to a vector of size this->Dim(), to which + the computed feature will be written. 
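       Note (an illustrative calculation, assuming the default frame options):
       with 25 ms frames at 16 kHz the window is 400 samples, padded up to
       512, so Dim() returns 512 / 2 + 1 = 257.  Element 0 of the output is
       set to the frame log-energy rather than to the log of the DC bin of
       the power spectrum.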
+ */ + void Compute(BaseFloat signal_log_energy, + BaseFloat vtln_warp, + VectorBase *signal_frame, + VectorBase *feature); + + ~SpectrogramComputer(); private: SpectrogramOptions opts_; BaseFloat log_energy_floor_; - FeatureWindowFunction feature_window_function_; SplitRadixRealFft *srfft_; - KALDI_DISALLOW_COPY_AND_ASSIGN(Spectrogram); + + // Disallow assignment. + SpectrogramComputer &operator=(const SpectrogramComputer &other); }; +typedef OfflineFeatureTpl Spectrogram; + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/feature-window.cc b/src/feat/feature-window.cc new file mode 100644 index 00000000000..f084f5c0170 --- /dev/null +++ b/src/feat/feature-window.cc @@ -0,0 +1,229 @@ +// feat/feature-window.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation +// 2013-2016 Johns Hopkins University (author: Daniel Povey) +// 2014 IMSL, PKU-HKUST (author: Wei Shi) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "feat/feature-window.h" +#include "matrix/matrix-functions.h" + + +namespace kaldi { + + +int64 FirstSampleOfFrame(int32 frame, + const FrameExtractionOptions &opts) { + int64 frame_shift = opts.WindowShift(); + if (opts.snip_edges) { + return frame * frame_shift; + } else { + int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; + } +} + +int32 NumFrames(int64 num_samples, + const FrameExtractionOptions &opts, + bool flush) { + int64 frame_shift = opts.WindowShift(); + int64 frame_length = opts.WindowSize(); + if (opts.snip_edges) { + // with --snip-edges=true (the default), we use a HTK-like approach to + // determining the number of frames-- all frames have to fit completely into + // the waveform, and the first frame begins at sample zero. + if (num_samples < frame_length) + return 0; + else + return (1 + ((num_samples - frame_length) / frame_shift)); + // You can understand the expression above as follows: 'num_samples - + // frame_length' is how much room we have to shift the frame within the + // waveform; 'frame_shift' is how much we shift it each time; and the ratio + // is how many times we can shift it (integer arithmetic rounds down). + } else { + // if --snip-edges=false, the number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. 
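    // Worked example with assumed values: at 16 kHz with the default 25 ms
    // window and 10 ms shift, frame_length = 400 and frame_shift = 160
    // samples.  For num_samples = 1000, the snip-edges branch above gives
    // 1 + (1000 - 400) / 160 = 4 frames, while the formula below gives
    // (1000 + 80) / 160 = 6 frames (possibly reduced further below when
    // flush == false).  With --snip-edges=false, frame 0 then begins at
    // FirstSampleOfFrame(0) = 0*160 + 80 - 200 = -120, i.e. before the start
    // of the signal, which ExtractWindow() handles by reflection.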
+ int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) + return num_frames; + + // note: 'end' always means the last plus one, i.e. one past the last. + int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts) + + frame_length; + + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. + while (num_frames > 0 && end_sample_of_last_frame > num_samples) { + num_frames--; + end_sample_of_last_frame -= frame_shift; + } + return num_frames; + } +} + + +void Dither(VectorBase *waveform, BaseFloat dither_value) { + for (int32 i = 0; i < waveform->Dim(); i++) + (*waveform)(i) += RandGauss() * dither_value; +} + + +void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff) { + if (preemph_coeff == 0.0) return; + KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0); + for (int32 i = waveform->Dim()-1; i > 0; i--) + (*waveform)(i) -= preemph_coeff * (*waveform)(i-1); + (*waveform)(0) -= preemph_coeff * (*waveform)(0); +} + +FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) { + int32 frame_length = opts.WindowSize(); + KALDI_ASSERT(frame_length > 0); + window.Resize(frame_length); + for (int32 i = 0; i < frame_length; i++) { + BaseFloat i_fl = static_cast(i); + if (opts.window_type == "hanning") { + window(i) = 0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)); + } else if (opts.window_type == "hamming") { + window(i) = 0.54 - 0.46*cos(M_2PI * i_fl / (frame_length-1)); + } else if (opts.window_type == "povey") { // like hamming but goes to zero at edges. + window(i) = pow(0.5 - 0.5*cos(M_2PI * i_fl / (frame_length-1)), 0.85); + } else if (opts.window_type == "rectangular") { + window(i) = 1.0; + } else { + KALDI_ERR << "Invalid window type " << opts.window_type; + } + } +} + +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + VectorBase *window, + BaseFloat *log_energy_pre_window) { + int32 frame_length = opts.WindowSize(); + KALDI_ASSERT(window->Dim() == frame_length); + + if (opts.dither != 0.0) + Dither(window, opts.dither); + + if (opts.remove_dc_offset) + window->Add(-window->Sum() / frame_length); + + if (log_energy_pre_window != NULL) { + BaseFloat energy = std::max(VecVec(*window, *window), + std::numeric_limits::epsilon()); + *log_energy_pre_window = Log(energy); + } + + if (opts.preemph_coeff != 0.0) + Preemphasize(window, opts.preemph_coeff); + + window->MulElements(window_function.window); +} + + +// ExtractWindow extracts a windowed frame of waveform with a power-of-two, +// padded size. It does mean subtraction, pre-emphasis and dithering as +// requested. 
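// A note on ordering, restating what ProcessWindow() above does: dithering
// and DC-offset removal are applied first, then the optional "raw"
// log-energy is taken, and only after that pre-emphasis and multiplication
// by the window function happen.  So the value written to
// log_energy_pre_window is unaffected by pre-emphasis and windowing.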
+void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, // with 0 <= f < NumFrames(feats, opts) + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window) { + KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0); + int32 frame_length = opts.WindowSize(), + frame_length_padded = opts.PaddedWindowSize(); + int64 num_samples = sample_offset + wave.Dim(), + start_sample = FirstSampleOfFrame(f, opts), + end_sample = start_sample + frame_length; + + if (opts.snip_edges) { + KALDI_ASSERT(start_sample >= sample_offset && + end_sample <= num_samples); + } else { + KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset); + } + + if (window->Dim() != frame_length_padded) + window->Resize(frame_length_padded, kUndefined); + + // wave_start and wave_end are start and end indexes into 'wave', for the + // piece of wave that we're trying to extract. + int32 wave_start = int32(start_sample - sample_offset), + wave_end = wave_start + frame_length; + if (wave_start >= 0 && wave_end <= wave.Dim()) { + // the normal case-- no edge effects to consider. + window->Range(0, frame_length).CopyFromVec( + wave.Range(wave_start, frame_length)); + } else { + // Deal with any end effects by reflection, if needed. This code will only + // be reached for about two frames per utterance, so we don't concern + // ourselves excessively with efficiency. + int32 wave_dim = wave.Dim(); + for (int32 s = 0; s < frame_length; s++) { + int32 s_in_wave = s + wave_start; + while (s_in_wave < 0 || s_in_wave >= wave_dim) { + // reflect around the beginning or end of the wave. + // e.g. -1 -> 0, -2 -> 1. + // dim -> dim - 1, dim + 1 -> dim - 2. + // the code supports repeated reflections, although this + // would only be needed in pathological cases. + if (s_in_wave < 0) s_in_wave = - s_in_wave - 1; + else s_in_wave = 2 * wave_dim - 1 - s_in_wave; + } + (*window)(s) = wave(s_in_wave); + } + } + + if (frame_length_padded > frame_length) + window->Range(frame_length, frame_length_padded - frame_length).SetZero(); + + SubVector frame(*window, 0, frame_length); + + ProcessWindow(opts, window_function, &frame, log_energy_pre_window); +} + +void ExtractWaveformRemainder(const VectorBase &wave, + const FrameExtractionOptions &opts, + Vector *wave_remainder) { + int32 frame_shift = opts.WindowShift(); + int32 num_frames = NumFrames(wave.Dim(), opts); + // offset is the amount at the start that has been extracted. + int32 offset = num_frames * frame_shift; + KALDI_ASSERT(wave_remainder != NULL); + int32 remaining_len = wave.Dim() - offset; + wave_remainder->Resize(remaining_len); + KALDI_ASSERT(remaining_len >= 0); + if (remaining_len > 0) + wave_remainder->CopyFromVec(SubVector(wave, offset, remaining_len)); +} + + +} // namespace kaldi diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h new file mode 100644 index 00000000000..748a8f91d10 --- /dev/null +++ b/src/feat/feature-window.h @@ -0,0 +1,207 @@ +// feat/feature-window.h + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University +// 2014-2016 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_FEATURE_WINDOW_H_ +#define KALDI_FEAT_FEATURE_WINDOW_H_ + +#include +#include + +#include "matrix/matrix-lib.h" +#include "util/common-utils.h" +#include "base/kaldi-error.h" + +namespace kaldi { +/// @addtogroup feat FeatureExtraction +/// @{ + +struct FrameExtractionOptions { + BaseFloat samp_freq; + BaseFloat frame_shift_ms; // in milliseconds. + BaseFloat frame_length_ms; // in milliseconds. + BaseFloat dither; // Amount of dithering, 0.0 means no dither. + BaseFloat preemph_coeff; // Preemphasis coefficient. + bool remove_dc_offset; // Subtract mean of wave before FFT. + std::string window_type; // e.g. Hamming window + bool round_to_power_of_two; + bool snip_edges; + // Maybe "hamming", "rectangular", "povey", "hanning" + // "povey" is a window I made to be similar to Hamming but to go to zero at the + // edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) + // I just don't think the Hamming window makes sense as a windowing function. + FrameExtractionOptions(): + samp_freq(16000), + frame_shift_ms(10.0), + frame_length_ms(25.0), + dither(1.0), + preemph_coeff(0.97), + remove_dc_offset(true), + window_type("povey"), + round_to_power_of_two(true), + snip_edges(true){ } + + void Register(OptionsItf *opts) { + opts->Register("sample-frequency", &samp_freq, + "Waveform data sample frequency (must match the waveform file, " + "if specified there)"); + opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds"); + opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds"); + opts->Register("preemphasis-coefficient", &preemph_coeff, + "Coefficient for use in signal preemphasis"); + opts->Register("remove-dc-offset", &remove_dc_offset, + "Subtract mean from waveform on each frame"); + opts->Register("dither", &dither, "Dithering constant (0.0 means no dither)"); + opts->Register("window-type", &window_type, "Type of window " + "(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\")"); + opts->Register("round-to-power-of-two", &round_to_power_of_two, + "If true, round window size to power of two."); + opts->Register("snip-edges", &snip_edges, + "If true, end effects will be handled by outputting only frames that " + "completely fit in the file, and the number of frames depends on the " + "frame-length. If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends."); + } + int32 WindowShift() const { + return static_cast(samp_freq * 0.001 * frame_shift_ms); + } + int32 WindowSize() const { + return static_cast(samp_freq * 0.001 * frame_length_ms); + } + int32 PaddedWindowSize() const { + return (round_to_power_of_two ? 
RoundUpToNearestPowerOfTwo(WindowSize()) : + WindowSize()); + } +}; + + +struct FeatureWindowFunction { + FeatureWindowFunction() {} + explicit FeatureWindowFunction(const FrameExtractionOptions &opts); + FeatureWindowFunction(const FeatureWindowFunction &other): + window(other.window) { } + Vector window; +}; + + +/** + This function returns the number of frames that we can extract from a wave + file with the given number of samples in it (assumed to have the same + sampling rate as specified in 'opts'). + + @param [in] wave_length The number of samples in the wave file. + @param [in] opts The frame-extraction options class + + @param [in] flush True if we are asserting that this number of samples is + 'all there is', false if we expecting more data to possibly come + in. This only makes a difference to the answer if opts.snips_edges + == false. For offline feature extraction you always want flush == + true. In an online-decoding context, once you know (or decide) that + no more data is coming in, you'd call it with flush == true at the + end to flush out any remaining data. +*/ +int32 NumFrames(int64 num_samples, + const FrameExtractionOptions &opts, + bool flush = true); + +/* + This function returns the index of the first sample of the frame indexed + 'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if + snip-edges=false, the formula is a little more complicated and the result may + be negative. +*/ +int64 FirstSampleOfFrame(int32 frame, + const FrameExtractionOptions &opts); + + + +void Dither(VectorBase *waveform, BaseFloat dither_value); + +void Preemphasize(VectorBase *waveform, BaseFloat preemph_coeff); + +/** + This function does all the windowing steps after actually + extracting the windowed signal: depeding on the + configuration, it does dithering, dc offset removal, + preemphasis, and multiplication by the windowing function. + @param [in] opts The options class to be used + @param [in] window_function The windowing function-- should have + been initialized using 'opts'. + @param [in,out] window A vector of size opts.WindowSize(). Note: + it will typically be a sub-vector of a larger vector of size + opts.PaddedWindowSize(), with the remaining samples zero, + as the FFT code is more efficient if it operates on data with + power-of-two size. + @param [out] log_energy_pre_window If non-NULL, then after dithering and + DC offset removal, this function will write to this pointer the log of + the total energy (i.e. sum-squared) of the frame. + */ +void ProcessWindow(const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + VectorBase *window, + BaseFloat *log_energy_pre_window = NULL); + + +/* + ExtractWindow() extracts a windowed frame of waveform (possibly with a + power-of-two, padded size, depending on the config), including all the + proessing done by ProcessWindow(). + + @param [in] sample_offset If 'wave' is not the entire waveform, but + part of it to the left has been discarded, then the + number of samples prior to 'wave' that we have + already discarded. Set this to zero if you are + processing the entire waveform in one piece, or + if you get 'no matching function' compilation + errors when updating the code. + @param [in] wave The waveform + @param [in] f The frame index to be extracted, with + 0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true) + @param [in] opts The options class to be used + @param [in] window_function The windowing function, as derived from the + options class. 
+ @param [out] window The windowed, possibly-padded waveform to be + extracted. Will be resized as needed. + @param [out] log_energy_pre_window If non-NULL, the log-energy of + the signal prior to pre-emphasis and multiplying by + the windowing function will be written to here. +*/ +void ExtractWindow(int64 sample_offset, + const VectorBase &wave, + int32 f, + const FrameExtractionOptions &opts, + const FeatureWindowFunction &window_function, + Vector *window, + BaseFloat *log_energy_pre_window = NULL); + + +// ExtractWaveformRemainder is useful if the waveform is coming in segments. +// It extracts the bit of the waveform at the end of this block that you +// would have to append the next bit of waveform to, if you wanted to have +// the same effect as everything being in one big block. +void ExtractWaveformRemainder(const VectorBase &wave, + const FrameExtractionOptions &opts, + Vector *wave_remainder); + + +/// @} End of "addtogroup feat" +} // namespace kaldi + + +#endif // KALDI_FEAT_FEATURE_WINDOW_H_ diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 9949a468d4c..714d963f01b 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -23,8 +23,9 @@ #include #include -#include "feat/mel-computations.h" #include "feat/feature-functions.h" +#include "feat/feature-window.h" +#include "feat/mel-computations.h" namespace kaldi { @@ -57,7 +58,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, KALDI_ERR << "Bad values in options: low-freq " << low_freq << " and high-freq " << high_freq << " vs. nyquist " << nyquist; - + BaseFloat fft_bin_width = sample_freq / window_length_padded; // fft-bin width [think of it as Nyquist-freq / half-window-length] @@ -73,7 +74,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, BaseFloat vtln_low = opts.vtln_low, vtln_high = opts.vtln_high; if (vtln_high < 0.0) vtln_high += nyquist; - + if (vtln_warp_factor != 1.0 && (vtln_low < 0.0 || vtln_low <= low_freq || vtln_low >= high_freq @@ -106,7 +107,8 @@ MelBanks::MelBanks(const MelBanksOptions &opts, Vector this_bin(num_fft_bins); int32 first_index = -1, last_index = -1; for (int32 i = 0; i < num_fft_bins; i++) { - BaseFloat freq = (fft_bin_width * i); // center freq of this fft bin. + BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft + // bin. BaseFloat mel = MelScale(freq); if (mel > left_mel && mel < right_mel) { BaseFloat weight; @@ -122,7 +124,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, } KALDI_ASSERT(first_index != -1 && last_index >= first_index && "You may have set --num-mel-bins too large."); - + bins_[bin].first = first_index; int32 size = last_index + 1 - first_index; bins_[bin].second.Resize(size); @@ -131,7 +133,7 @@ MelBanks::MelBanks(const MelBanksOptions &opts, // Replicate a bug in HTK, for testing purposes. if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0) bins_[bin].second(0) = 0.0; - + } if (debug_) { for (size_t i = 0; i < bins_.size(); i++) { @@ -141,6 +143,12 @@ MelBanks::MelBanks(const MelBanksOptions &opts, } } +MelBanks::MelBanks(const MelBanks &other): + center_freqs_(other.center_freqs_), + bins_(other.bins_), + debug_(other.debug_), + htk_mode_(other.htk_mode_) { } + BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. 
BaseFloat vtln_high_cutoff, BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation @@ -218,19 +226,18 @@ BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower f // "power_spectrum" contains fft energies. void MelBanks::Compute(const VectorBase &power_spectrum, - Vector *mel_energies_out) const { + VectorBase *mel_energies_out) const { int32 num_bins = bins_.size(); - if (mel_energies_out->Dim() != num_bins) - mel_energies_out->Resize(num_bins); + KALDI_ASSERT(mel_energies_out->Dim() == num_bins); for (int32 i = 0; i < num_bins; i++) { int32 offset = bins_[i].first; const Vector &v(bins_[i].second); BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim())); // HTK-like flooring- for testing purposes (we prefer dither) - if (htk_mode_ && energy < 1.0) energy = 1.0; + if (htk_mode_ && energy < 1.0) energy = 1.0; (*mel_energies_out)(i) = energy; - + // The following assert was added due to a problem with OpenBlas that // we had at one point (it was a bug in that library). Just to detect // it early. @@ -303,5 +310,33 @@ void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) { } } +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans) { + int32 n = mel_banks.NumBins(); + // Central frequency of each mel bin. + const Vector &f0 = mel_banks.GetCenterFreqs(); + ans->Resize(n); + for (int32 i = 0; i < n; i++) { + BaseFloat fsq = f0(i) * f0(i); + BaseFloat fsub = fsq / (fsq + 1.6e5); + (*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6)); + } +} + + +// Compute LP coefficients from autocorrelation coefficients. +BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out) { + int32 n = autocorr_in.Dim() - 1; + KALDI_ASSERT(lpc_out->Dim() == n); + Vector tmp(n); + BaseFloat ans = Durbin(n, autocorr_in.Data(), + lpc_out->Data(), + tmp.Data()); + if (ans <= 0.0) + KALDI_WARN << "Zero energy in LPC computation"; + return -Log(1.0 / ans); // forms the C0 value +} + } // namespace kaldi diff --git a/src/feat/mel-computations.h b/src/feat/mel-computations.h index fbc9f532cd0..5df36c8cb90 100644 --- a/src/feat/mel-computations.h +++ b/src/feat/mel-computations.h @@ -1,6 +1,7 @@ // feat/mel-computations.h // Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation +// 2016 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -36,9 +37,43 @@ namespace kaldi { /// @addtogroup feat FeatureExtraction /// @{ -struct FrameExtractionOptions; // defined in feature-function.h +struct FrameExtractionOptions; // defined in feature-window.h + + +struct MelBanksOptions { + int32 num_bins; // e.g. 25; number of triangular bins + BaseFloat low_freq; // e.g. 20; lower frequency cutoff + BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative + // ->added to the Nyquist frequency to get the cutoff. + BaseFloat vtln_low; // vtln lower cutoff of warping function. + BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + bool debug_mel; + // htk_mode is a "hidden" config, it does not show up on command line. + // Enables more exact compatibibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. 
+ bool htk_mode; + explicit MelBanksOptions(int num_bins = 25) + : num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100), + vtln_high(-500), debug_mel(false), htk_mode(false) {} + + void Register(OptionsItf *opts) { + opts->Register("num-mel-bins", &num_bins, + "Number of triangular mel-frequency bins"); + opts->Register("low-freq", &low_freq, + "Low cutoff frequency for mel bins"); + opts->Register("high-freq", &high_freq, + "High cutoff frequency for mel bins (if < 0, offset from Nyquist)"); + opts->Register("vtln-low", &vtln_low, + "Low inflection point in piecewise linear VTLN warping function"); + opts->Register("vtln-high", &vtln_high, + "High inflection point in piecewise linear VTLN warping function" + " (if negative, offset from high-mel-freq"); + opts->Register("debug-mel", &debug_mel, + "Print out debugging information for mel bin computation"); + } +}; -struct MelBanksOptions; // defined in feature-function.h class MelBanks { public: @@ -74,14 +109,19 @@ class MelBanks { /// Compute Mel energies (note: not log enerties). /// At input, "fft_energies" contains the FFT energies (not log). void Compute(const VectorBase &fft_energies, - Vector *mel_energies_out) const; + VectorBase *mel_energies_out) const; int32 NumBins() const { return bins_.size(); } // returns vector of central freq of each bin; needed by plp code. const Vector &GetCenterFreqs() const { return center_freqs_; } + // Copy constructor + MelBanks(const MelBanks &other); private: + // Disallow assignment + MelBanks &operator = (const MelBanks &other); + // center frequencies of bins, numbered from 0 ... num_bins-1. // Needed by GetCenterFreqs(). Vector center_freqs_; @@ -92,7 +132,6 @@ class MelBanks { bool debug_; bool htk_mode_; - KALDI_DISALLOW_COPY_AND_ASSIGN(MelBanks); }; @@ -107,10 +146,21 @@ void ComputeLifterCoeffs(BaseFloat Q, VectorBase *coeffs); // pAC - autocorrelation coefficients [n + 1] // pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i] * s[n-i]}}) // F(z) = 1 / (1 - A(z)), 1 is not stored in the demoninator +// Returns log energy of residual (I think) BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp); +// Compute LP coefficients from autocorrelation coefficients. +// Returns log energy of residual (I think) +BaseFloat ComputeLpc(const VectorBase &autocorr_in, + Vector *lpc_out); + void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst); + + +void GetEqualLoudnessVector(const MelBanks &mel_banks, + Vector *ans); + /// @} End of "addtogroup feat" } // namespace kaldi diff --git a/src/feat/online-feature-test.cc b/src/feat/online-feature-test.cc index 7a46e837151..556160f8e53 100644 --- a/src/feat/online-feature-test.cc +++ b/src/feat/online-feature-test.cc @@ -172,6 +172,8 @@ void TestOnlineMfcc() { op.mel_opts.low_freq = 0.0; op.htk_compat = false; op.use_energy = false; // C0 not energy. + if (RandInt(0, 1) == 0) + op.frame_opts.snip_edges = false; Mfcc mfcc(op); // compute mfcc offline diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index b9d74d3a293..267a4724580 100644 --- a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -24,82 +24,91 @@ namespace kaldi { - template void OnlineGenericBaseFeature::GetFrame(int32 frame, VectorBase *feat) { - KALDI_ASSERT(frame >= 0 && frame < num_frames_); - KALDI_ASSERT(feat->Dim() == Dim()); - feat->CopyFromVec(features_.Row(frame)); + // 'at' does size checking. 
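  // Any frame index below NumFramesReady() will already have been filled in
  // by ComputeFeatures(), so the pointer dereferenced here should never be
  // NULL.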
+ feat->CopyFromVec(*(features_.at(frame))); }; -template -bool OnlineGenericBaseFeature::IsLastFrame(int32 frame) const { - return (frame == num_frames_ - 1 && input_finished_); -} - template OnlineGenericBaseFeature::OnlineGenericBaseFeature( - const typename C::Options &opts) - :mfcc_or_plp_(opts), input_finished_(false), num_frames_(0), - sampling_frequency_(opts.frame_opts.samp_freq) { } + const typename C::Options &opts): + computer_(opts), window_function_(computer_.GetFrameOptions()), + input_finished_(false), waveform_offset_(0) { } template void OnlineGenericBaseFeature::AcceptWaveform(BaseFloat sampling_rate, - const VectorBase &waveform) { - if (waveform.Dim() == 0) { + const VectorBase &waveform) { + BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq; + if (sampling_rate != expected_sampling_rate) + KALDI_ERR << "Sampling frequency mismatch, expected " + << expected_sampling_rate << ", got " << sampling_rate; + if (waveform.Dim() == 0) return; // Nothing to do. - } - if (input_finished_) { + if (input_finished_) KALDI_ERR << "AcceptWaveform called after InputFinished() was called."; - } - if (sampling_rate != sampling_frequency_) { - KALDI_ERR << "Sampling frequency mismatch, expected " - << sampling_frequency_ << ", got " << sampling_rate; - } - - Vector appended_wave; - - const VectorBase &wave_to_use = (waveform_remainder_.Dim() != 0 ? - appended_wave : waveform); - if (waveform_remainder_.Dim() != 0) { - appended_wave.Resize(waveform_remainder_.Dim() + - waveform.Dim()); + // append 'waveform' to 'waveform_remainder_.' + Vector appended_wave(waveform_remainder_.Dim() + waveform.Dim()); + if (waveform_remainder_.Dim() != 0) appended_wave.Range(0, waveform_remainder_.Dim()).CopyFromVec( waveform_remainder_); - appended_wave.Range(waveform_remainder_.Dim(), - waveform.Dim()).CopyFromVec(waveform); - } - waveform_remainder_.Resize(0); - - Matrix feats; - BaseFloat vtln_warp = 1.0; // We don't support VTLN warping in this wrapper. - mfcc_or_plp_.Compute(wave_to_use, vtln_warp, &feats, &waveform_remainder_); + appended_wave.Range(waveform_remainder_.Dim(), waveform.Dim()).CopyFromVec( + waveform); + waveform_remainder_.Swap(&appended_wave); + ComputeFeatures(); +} - if (feats.NumRows() == 0) { - // Presumably we got a very small waveform and could output no whole - // features. The waveform will have been appended to waveform_remainder_. - return; +template +void OnlineGenericBaseFeature::ComputeFeatures() { + const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions(); + int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim(); + int32 num_frames_old = features_.size(), + num_frames_new = NumFrames(num_samples_total, frame_opts, + input_finished_); + KALDI_ASSERT(num_frames_new >= num_frames_old); + features_.resize(num_frames_new, NULL); + + Vector window; + bool need_raw_log_energy = computer_.NeedRawLogEnergy(); + for (int32 frame = num_frames_old; frame < num_frames_new; frame++) { + BaseFloat raw_log_energy = 0.0; + ExtractWindow(waveform_offset_, waveform_remainder_, frame, + frame_opts, window_function_, &window, + need_raw_log_energy ? &raw_log_energy : NULL); + Vector *this_feature = new Vector(computer_.Dim(), + kUndefined); + // note: this online feature-extraction code does not support VTLN. 
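    // The vector allocated just above is owned by features_ and is freed by
    // DeletePointers(&features_) in the destructor (see online-feature.h).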
+ BaseFloat vtln_warp = 1.0; + computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature); + features_[frame] = this_feature; } - int32 new_num_frames = num_frames_ + feats.NumRows(); - BaseFloat increase_ratio = 1.5; // This is a tradeoff between memory and - // compute; it's the factor by which we - // increase the memory used each time. - if (new_num_frames > features_.NumRows()) { - int32 new_num_rows = std::max(new_num_frames, - features_.NumRows() * increase_ratio); - // Increase the size of the features_ matrix and copy over any existing - // data. - features_.Resize(new_num_rows, Dim(), kCopyData); + // OK, we will now discard any portion of the signal that will not be + // necessary to compute frames in the future. + int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new, + frame_opts); + int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_; + if (samples_to_discard > 0) { + // discard the leftmost part of the waveform that we no longer need. + int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard; + if (new_num_samples <= 0) { + // odd, but we'll try to handle it. + waveform_offset_ += waveform_remainder_.Dim(); + waveform_remainder_.Resize(0); + } else { + Vector new_remainder(new_num_samples); + new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard, + new_num_samples)); + waveform_offset_ += samples_to_discard; + waveform_remainder_.Swap(&new_remainder); + } } - features_.Range(num_frames_, feats.NumRows(), 0, Dim()).CopyFromMat(feats); - num_frames_ = new_num_frames; } // instantiate the templates defined here for MFCC, PLP and filterbank classes. -template class OnlineGenericBaseFeature; -template class OnlineGenericBaseFeature; -template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; +template class OnlineGenericBaseFeature; OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other): @@ -317,7 +326,7 @@ void OnlineCmvn::GetFrame(int32 frame, if (!skip_dims_.empty()) FakeStatsForSomeDims(skip_dims_, &stats); - + // call the function ApplyCmvn declared in ../transform/cmvn.h, which // requires a matrix. Matrix feat_mat(1, dim); @@ -486,8 +495,7 @@ void OnlineCacheFeature::GetFrame(int32 frame, VectorBase *feat) { void OnlineCacheFeature::ClearCache() { for (size_t i = 0; i < cache_.size(); i++) - if (cache_[i] != NULL) - delete cache_[i]; + delete cache_[i]; cache_.resize(0); } diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index d0b4d54c256..ba87f696492 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -41,26 +41,34 @@ namespace kaldi { /// @{ - +/// This is a templated class for online feature extraction; +/// it's templated on a class like MfccComputer or PlpComputer +/// that does the basic feature extraction. template class OnlineGenericBaseFeature: public OnlineBaseFeature { public: // // First, functions that are present in the interface: // - virtual int32 Dim() const { return mfcc_or_plp_.Dim(); } - - // Note: this will only ever return true if you call InputFinished(), which - // isn't really necessary to do unless you want to make sure to flush out the - // last few frames of delta or LDA features to exactly match a non-online - // decode of some data. 
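  // A minimal usage sketch for the MFCC instantiation (OnlineMfcc, typedef'd
  // near the end of this file); 'opts', 'chunk' and GetNextChunk() are
  // hypothetical names and real application code will differ:
  //
  //   OnlineMfcc online_mfcc(opts);  // opts is an MfccOptions.
  //   int32 num_frames_read = 0;
  //   Vector<BaseFloat> chunk;
  //   while (GetNextChunk(&chunk)) {
  //     online_mfcc.AcceptWaveform(opts.frame_opts.samp_freq, chunk);
  //     for (; num_frames_read < online_mfcc.NumFramesReady();
  //          num_frames_read++) {
  //       Vector<BaseFloat> feat(online_mfcc.Dim());
  //       online_mfcc.GetFrame(num_frames_read, &feat);
  //       // ... use 'feat' ...
  //     }
  //   }
  //   online_mfcc.InputFinished();  // may flush out a last frame or two
  //                                 // when --snip-edges=false.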
- virtual bool IsLastFrame(int32 frame) const; - virtual int32 NumFramesReady() const { return num_frames_; } + virtual int32 Dim() const { return computer_.Dim(); } + + // Note: IsLastFrame() will only ever return true if you have called + // InputFinished() (and this frame is the last frame). + virtual bool IsLastFrame(int32 frame) const { + return input_finished_ && frame == NumFramesReady() - 1; + } + virtual BaseFloat FrameShiftInSeconds() const { + return computer_.GetFrameOptions().frame_shift_ms * 1.0e-03; + } + + virtual int32 NumFramesReady() const { return features_.size(); } + virtual void GetFrame(int32 frame, VectorBase *feat); - // // Next, functions that are not in the interface. - // + + + // Constructor from options class explicit OnlineGenericBaseFeature(const typename C::Options &opts); // This would be called from the application, when you get @@ -69,42 +77,58 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature { // expected in the options. virtual void AcceptWaveform(BaseFloat sampling_rate, const VectorBase &waveform); - + // InputFinished() tells the class you won't be providing any - // more waveform. This will help flush out the last few frames - // of delta or LDA features. - virtual void InputFinished() { input_finished_= true; } + // more waveform. This will help flush out the last frame or two + // of features, in the case where snip-edges == false; it also + // affects the return value of IsLastFrame(). + virtual void InputFinished() { + input_finished_ = true; + ComputeFeatures(); + } + ~OnlineGenericBaseFeature() { + DeletePointers(&features_); + } private: - C mfcc_or_plp_; // class that does the MFCC or PLP computation + // This function computes any additional feature frames that it is possible to + // compute from 'waveform_remainder_', which at this point may contain more + // than just a remainder-sized quantity (because AcceptWaveform() appends to + // waveform_remainder_ before calling this function). It adds these feature + // frames to features_, and shifts off any now-unneeded samples of input from + // waveform_remainder_ while incrementing waveform_offset_ by the same amount. + void ComputeFeatures(); + + C computer_; // class that does the MFCC or PLP or filterbank computation + + FeatureWindowFunction window_function_; // features_ is the Mfcc or Plp or Fbank features that we have already computed. - Matrix features_; + + std::vector*> features_; // True if the user has called "InputFinished()" bool input_finished_; - // num_frames_ is the number of frames of MFCC features we have - // already computed. It may be less than the size of features_, - // because when we resize that matrix we leave some extra room, - // so that we don't spend too much time resizing. - int32 num_frames_; - // The sampling frequency, extracted from the config. Should // be identical to the waveform supplied. BaseFloat sampling_frequency_; + // waveform_offset_ is the number of samples of waveform that we have + // already discarded, i.e. thatn were prior to 'waveform_remainder_'. + int64 waveform_offset_; + // waveform_remainder_ is a short piece of waveform that we may need to keep // after extracting all the whole frames we can (whatever length of feature // will be required for the next phase of computation). 
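  // Roughly speaking, ComputeFeatures() maintains the invariant that
  // waveform_offset_ + waveform_remainder_.Dim() equals the total number of
  // samples accepted so far, and that waveform_offset_ never passes
  // FirstSampleOfFrame(features_.size(), frame_opts), so the frames not yet
  // computed can still be extracted from waveform_remainder_.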
Vector waveform_remainder_; }; -typedef OnlineGenericBaseFeature OnlineMfcc; -typedef OnlineGenericBaseFeature OnlinePlp; -typedef OnlineGenericBaseFeature OnlineFbank; +typedef OnlineGenericBaseFeature OnlineMfcc; +typedef OnlineGenericBaseFeature OnlinePlp; +typedef OnlineGenericBaseFeature OnlineFbank; /// This class takes a Matrix and wraps it as an @@ -119,8 +143,12 @@ class OnlineMatrixFeature: public OnlineFeatureInterface { virtual int32 Dim() const { return mat_.NumCols(); } + virtual BaseFloat FrameShiftInSeconds() const { + return 0.01f; + } + virtual int32 NumFramesReady() const { return mat_.NumRows(); } - + virtual void GetFrame(int32 frame, VectorBase *feat) { feat->CopyFromVec(mat_.Row(frame)); } @@ -129,6 +157,7 @@ class OnlineMatrixFeature: public OnlineFeatureInterface { return (frame + 1 == mat_.NumRows()); } + private: const MatrixBase &mat_; }; @@ -156,7 +185,7 @@ struct OnlineCmvnOptions { // buffer used for caching CMVN stats. std::string skip_dims; // Colon-separated list of dimensions to skip normalization // of, e.g. 13:14:15. - + OnlineCmvnOptions(): cmn_window(600), speaker_frames(600), @@ -166,7 +195,7 @@ struct OnlineCmvnOptions { modulus(20), ring_buffer_size(20), skip_dims("") { } - + void Check() { KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames && modulus > 0); @@ -225,11 +254,11 @@ struct OnlineCmvnState { global_cmvn_stats(global_stats) { } // Copy constructor - OnlineCmvnState(const OnlineCmvnState &other); + OnlineCmvnState(const OnlineCmvnState &other); void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); - + // Use the default assignment operator. }; @@ -242,7 +271,7 @@ struct OnlineCmvnState { We normally only do so in the "online" GMM-based decoding, e.g. in online2bin/online2-wav-gmm-latgen-faster.cc; see also the script steps/online/prepare_online_decoding.sh and steps/online/decode.sh. - + In the steady state (in the middle of a long utterance), this class accumulates CMVN statistics from the previous "cmn_window" frames (default 600 frames, or 6 seconds), and uses these to normalize the mean and possibly @@ -270,13 +299,15 @@ class OnlineCmvn: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } // The online cmvn does not introduce any additional latency. virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } virtual void GetFrame(int32 frame, VectorBase *feat); - // // Next, functions that are not in the interface. 
// @@ -400,6 +431,9 @@ class OnlineSpliceFrames: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -430,6 +464,9 @@ class OnlineTransform: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } @@ -461,6 +498,9 @@ class OnlineDeltaFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -489,6 +529,9 @@ class OnlineCacheFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return src_->IsLastFrame(frame); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return src_->NumFramesReady(); } @@ -520,6 +563,10 @@ class OnlineAppendFeature: public OnlineFeatureInterface { virtual bool IsLastFrame(int32 frame) const { return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame)); } + // Hopefully sources have the same rate + virtual BaseFloat FrameShiftInSeconds() const { + return src1_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const { return std::min(src1_->NumFramesReady(), src2_->NumFramesReady()); diff --git a/src/feat/pitch-functions.cc b/src/feat/pitch-functions.cc index 795fab3b2d4..12dd5030184 100644 --- a/src/feat/pitch-functions.cc +++ b/src/feat/pitch-functions.cc @@ -576,6 +576,8 @@ class OnlinePitchFeatureImpl { explicit OnlinePitchFeatureImpl(const PitchExtractionOptions &opts); int32 Dim() const { return 2; } + + BaseFloat FrameShiftInSeconds() const; int32 NumFramesReady() const; @@ -879,6 +881,10 @@ bool OnlinePitchFeatureImpl::IsLastFrame(int32 frame) const { return (input_finished_ && frame + 1 == T); } +BaseFloat OnlinePitchFeatureImpl::FrameShiftInSeconds() const { + return opts_.frame_shift_ms * 1.0e-03; +} + int32 OnlinePitchFeatureImpl::NumFramesReady() const { int32 num_frames = lag_nccf_.size(), latency = frames_latency_; @@ -1171,6 +1177,10 @@ bool OnlinePitchFeature::IsLastFrame(int32 frame) const { return impl_->IsLastFrame(frame); } +BaseFloat OnlinePitchFeature::FrameShiftInSeconds() const { + return impl_->FrameShiftInSeconds(); +} + void OnlinePitchFeature::GetFrame(int32 frame, VectorBase *feat) { impl_->GetFrame(frame, feat); } @@ -1335,8 +1345,6 @@ inline void AppendVector(const VectorBase &src, Vector *dst) { dst->Range(dst->Dim() - src.Dim(), src.Dim()).CopyFromVec(src); } -const int32 OnlineProcessPitch::kRawFeatureDim; - /** Note on the implementation of OnlineProcessPitch: the OnlineFeatureInterface allows random access to features (i.e. 
not necessarily diff --git a/src/feat/pitch-functions.h b/src/feat/pitch-functions.h index 52b3f815cde..fd9ead0c090 100644 --- a/src/feat/pitch-functions.h +++ b/src/feat/pitch-functions.h @@ -301,6 +301,8 @@ class OnlinePitchFeature: public OnlineBaseFeature { virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ } virtual int32 NumFramesReady() const; + + virtual BaseFloat FrameShiftInSeconds() const; virtual bool IsLastFrame(int32 frame) const; @@ -336,6 +338,9 @@ class OnlineProcessPitch: public OnlineFeatureInterface { else return src_->IsLastFrame(frame - opts_.delay); } + virtual BaseFloat FrameShiftInSeconds() const { + return src_->FrameShiftInSeconds(); + } virtual int32 NumFramesReady() const; @@ -348,7 +353,9 @@ class OnlineProcessPitch: public OnlineFeatureInterface { OnlineFeatureInterface *src); private: - static const int32 kRawFeatureDim = 2; // input: (nccf, pitch) + enum { kRawFeatureDim = 2}; // anonymous enum to define a constant. + // kRawFeatureDim defines the dimension + // of the input: (nccf, pitch) ProcessPitchOptions opts_; OnlineFeatureInterface *src_; diff --git a/src/feat/signal-test.cc b/src/feat/signal-test.cc new file mode 100644 index 00000000000..39a379040b0 --- /dev/null +++ b/src/feat/signal-test.cc @@ -0,0 +1,63 @@ +// feat/signal-test.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
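Regarding the kRawFeatureDim change in pitch-functions.h above: the likely reason for switching from a static const int32 member to an anonymous enum is that, before C++17, an in-class-initialized static const integer still requires an out-of-class definition whenever it is odr-used (for example, passed by const reference to std::min); that definition is exactly the line const int32 OnlineProcessPitch::kRawFeatureDim; deleted from pitch-functions.cc above. An enumerator has no storage, so no separate definition is ever needed. A tiny sketch with a hypothetical struct Foo (not taken from the patch):

    struct Foo {
      static const int32 kDimStatic = 2;  // odr-use (e.g. std::min(x, Foo::kDimStatic))
                                          // needs "const int32 Foo::kDimStatic;" in a .cc file
      enum { kDimEnum = 2 };              // an enumerator never needs a separate definition
    };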
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/signal.h" + +namespace kaldi { + +void UnitTestBlockConvolution() { + for (int32 i = 0; i < 5; i++) { + int32 signal_length = 4000000 + Rand() % 400000; + int32 filter_length = 10000 + Rand() % 1000; + Vector signal(signal_length); + Vector filter(filter_length); + signal.SetRandn(); + filter.SetRandn(); + Vector signal_test(signal); + FFTbasedConvolveSignals(filter, &signal_test); + FFTbasedBlockConvolveSignals(filter, &signal); + AssertEqual(signal, signal_test, 0.000001 * signal.Dim()); + } +} + +void UnitTestConvolution() { + for (int32 i = 0; i < 5; i++) { + int32 signal_length = 40000 + Rand() % 4000; + int32 filter_length = 100 + Rand() % 100; + Vector signal(signal_length); + Vector filter(filter_length); + signal.SetRandn(); + filter.SetRandn(); + Vector signal_test(signal); + ConvolveSignals(filter, &signal_test); + FFTbasedBlockConvolveSignals(filter, &signal); + AssertEqual(signal, signal_test, 0.0001 * signal.Dim()); + } +} +} + +int main() { + using namespace kaldi; + UnitTestBlockConvolution(); + UnitTestConvolution(); + KALDI_LOG << "Tests succeeded."; + +} diff --git a/src/feat/signal.cc b/src/feat/signal.cc new file mode 100644 index 00000000000..e8fbb0b84cf --- /dev/null +++ b/src/feat/signal.cc @@ -0,0 +1,123 @@ +// feat/signal.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
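The block (overlap-add) variant implemented below trades one huge FFT for many small ones: the filter is zero-padded once to fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length), the signal is processed in blocks of block_length = fft_length - filter_length + 1 samples, and the trailing filter_length - 1 samples of each inverse FFT are carried over in temp_pad and added into the start of the next block. For the sizes exercised by the unit test above this gives, roughly (a sketch, assuming RoundUpToNearestPowerOfTwo() returns the next power of two at or above its argument):

    // Sizes the implementation below would choose for a ~10000-tap filter
    // (roughly the case exercised by UnitTestBlockConvolution() above):
    int32 filter_length = 10000;
    int32 fft_length    = RoundUpToNearestPowerOfTwo(4 * filter_length);  // 65536
    int32 block_length  = fft_length - filter_length + 1;                 // 55537
    // Each block costs one forward and one inverse FFT of size fft_length,
    // versus a single FFT over the whole padded signal (several million points
    // in that test) in FFTbasedConvolveSignals().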
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/signal.h" + +namespace kaldi { + +void ElementwiseProductOfFft(const Vector &a, Vector *b) { + int32 num_fft_bins = a.Dim() / 2; + for (int32 i = 0; i < num_fft_bins; i++) { + // do complex multiplication + ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1))); + } +} + +void ConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + Vector signal_padded(signal_length + filter_length - 1); + signal_padded.SetZero(); + for (int32 i = 0; i < signal_length; i++) { + for (int32 j = 0; j < filter_length; j++) { + signal_padded(i + j) += (*signal)(i) * filter(j); + } + } + signal->CopyFromVec(signal_padded.Range(0, signal_length)); +} + + +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + + int32 fft_length = RoundUpToNearestPowerOfTwo(signal_length + filter_length - 1); + KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length; + + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector signal_padded(fft_length); + signal_padded.Range(0, signal_length).CopyFromVec(*signal); + srfft.Compute(signal_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_padded); + + srfft.Compute(signal_padded.Data(), false); + signal_padded.Scale(1.0 / fft_length); + + signal->CopyFromVec(signal_padded.Range(0, signal_length)); +} + +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal) { + int32 signal_length = signal->Dim(); + int32 filter_length = filter.Dim(); + + KALDI_VLOG(1) << "Length of the filter is " << filter_length; + + int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length); + KALDI_VLOG(1) << "Best FFT length is " << fft_length; + + int32 block_length = fft_length - filter_length + 1; + KALDI_VLOG(1) << "Block size is " << block_length; + SplitRadixRealFft srfft(fft_length); + + Vector filter_padded(fft_length); + filter_padded.Range(0, filter_length).CopyFromVec(filter); + srfft.Compute(filter_padded.Data(), true); + + Vector temp_pad(filter_length - 1); + temp_pad.SetZero(); + Vector signal_block_padded(fft_length); + + for (int32 po = 0; po < signal_length; po += block_length) { + // get a block of the signal + int32 process_length = std::min(block_length, signal_length - po); + signal_block_padded.SetZero(); + signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length)); + + srfft.Compute(signal_block_padded.Data(), true); + + ElementwiseProductOfFft(filter_padded, &signal_block_padded); + + srfft.Compute(signal_block_padded.Data(), false); + signal_block_padded.Scale(1.0 / fft_length); + + // combine the block + if (po + block_length < signal_length) { // current block is not the last block + signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length)); + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1)); + } else { + signal->Range(po, signal_length - po).CopyFromVec( + signal_block_padded.Range(0, signal_length - po)); + if (filter_length - 1 < signal_length - po) + signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad); + else + signal->Range(po, signal_length - po).AddVec(1.0, 
temp_pad.Range(0, signal_length - po)); + } + } +} +} + diff --git a/src/feat/signal.h b/src/feat/signal.h new file mode 100644 index 00000000000..7ff0ce33b52 --- /dev/null +++ b/src/feat/signal.h @@ -0,0 +1,51 @@ +// feat/signal.h + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_FEAT_SIGNAL_H_ +#define KALDI_FEAT_SIGNAL_H_ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" + +namespace kaldi { + +/* + This function implements a simple non-FFT-based convolution of two signals. + It is suggested to use the FFT-based convolution function which is more + efficient. +*/ +void ConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based convolution of two signals. + However this should be an inefficient version of BlockConvolveSignals() + as it processes the entire signal with a single FFT. +*/ +void FFTbasedConvolveSignals(const Vector &filter, Vector *signal); + +/* + This function implements FFT-based block convolution of two signals using + overlap-add method. This is an efficient way to evaluate the discrete + convolution of a long signal with a finite impulse response filter. +*/ +void FFTbasedBlockConvolveSignals(const Vector &filter, Vector *signal); + +} // namespace kaldi + +#endif // KALDI_FEAT_SIGNAL_H_ diff --git a/src/feat/sinusoid-detection.cc b/src/feat/sinusoid-detection.cc index 187b94953ac..bf6b0b9e4fe 100644 --- a/src/feat/sinusoid-detection.cc +++ b/src/feat/sinusoid-detection.cc @@ -104,7 +104,7 @@ void SinusoidDetector::QuadraticMaximize( // Also, x1.y2 - y1 = a (x1 - x1^2) + (x1 - 1) c, so // a = ( (x1 y2 - y1) - (x1 - 1) c) / (x1 - x1^2), and // b = y2 - a - c. - BaseFloat c = y0, + BaseFloat c = y0, a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), b = y2 - a - c; @@ -152,8 +152,8 @@ BaseFloat SinusoidDetector::QuadraticInterpolate( KALDI_ASSERT(x1 >= 0.0 && x1 <= 1.0); if (x1 == 0.0) return y0; else if (x1 == 1.0) return y2; - - BaseFloat c = y0, + + BaseFloat c = y0, a = (x1 * y2 - y1 - (x1 - 1.0) * c) / (x1 - x1*x1), b = y2 - a - c; return a * x * x + b * x + c; @@ -172,7 +172,7 @@ void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq, BaseFloat *cos_data = cos_vec->Data(), *sin_data = sin_vec->Data(); BaseFloat factor_real = cos(M_2PI * freq / samp_freq), factor_im = sin(M_2PI * freq / samp_freq); - + // process frames in batches of size "batch_size", after which we recompute // the starting point to prevent loss of accuracy due to drift. 
for (int32 b = 0; b * batch_size < dim; b++) { @@ -191,7 +191,7 @@ void SinusoidDetector::CreateCosAndSin(BaseFloat samp_freq, } SinusoidDetector::SinusoidDetector(BaseFloat samp_freq, - int32 num_samp): + int32 num_samp): samp_freq_(samp_freq), num_samples_(num_samp), num_samples_padded_(RoundUpToNearestPowerOfTwo(num_samp)), @@ -208,14 +208,14 @@ void SinusoidDetector::SelfTest( BaseFloat final_energy) { int32 num_bins = num_samples_padded_ * 2 + 1; - + { BaseFloat cutoff = 0.0; for (int32 k = 0; k <= num_bins; k += 4) cutoff = std::max(cutoff, info[k].energy); BaseFloat energy_upper_bound = factor1_ * cutoff; if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor1]: " + KALDI_WARN << "Self-testing failed [factor1]: " << final_energy << " > " << energy_upper_bound << ", num-samples is " << num_samples_ << ", freq/nyquist = " @@ -231,17 +231,17 @@ void SinusoidDetector::SelfTest( cutoff = std::max(cutoff, info[k].energy); BaseFloat energy_upper_bound = factor2_ * cutoff; if (final_energy > energy_upper_bound) { - KALDI_WARN << "Self-testing failed [factor2]: " + KALDI_WARN << "Self-testing failed [factor2]: " << final_energy << " > " << energy_upper_bound << ", num-samples is " << num_samples_ << ", freq/nyquist = " << (final_freq / (samp_freq_ * 0.5)) << "- would require factor2 >= " << (final_energy / cutoff); - + } } - + } @@ -249,7 +249,7 @@ BaseFloat SinusoidDetector::OptimizeFrequency( const std::vector &info, int32 *bin_out, BaseFloat *offset_out) const { - + BaseFloat max_energy = 0.0; *bin_out = -1; int32 max_freq = num_samples_padded_ * 2; @@ -320,20 +320,20 @@ BaseFloat SinusoidDetector::DetectSinusoid( // between bins, with an offset. int32 bin; BaseFloat offset; - + BaseFloat opt_energy = OptimizeFrequency(info, &bin, &offset); if (opt_energy == 0.0) return 0.0; BaseFloat max_freq = (bin + offset) * samp_freq_ / (num_samples_padded_ * 4); - + KALDI_VLOG(4) << "Best frequency based on interpolation is " << max_freq << ", best energy is " << opt_energy << ", bin is " << bin; OptimizedInfo final_info; - + FineOptimizeFrequency(signal, bin, offset, &info, &final_info); // the following while loop will rarely be accessed. @@ -342,7 +342,7 @@ BaseFloat SinusoidDetector::DetectSinusoid( FineOptimizeFrequency(signal, bin, 1.0, &info, &final_info); } - // the following while loop will rarely be accessed. + // the following while loop will rarely be accessed. while (final_info.offset == 1.0 && bin < num_samples_padded_ * 2) { bin++; FineOptimizeFrequency(signal, bin, 0.0, &info, &final_info); @@ -353,9 +353,9 @@ BaseFloat SinusoidDetector::DetectSinusoid( // next-to-highest allowed bin (note, "bin" here is a range, and it can // never have the value num_samples_padded_ * 2), we tend to get more // estimation error than usual, so do another round of optimization. - FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info); + FineOptimizeFrequency(signal, bin, final_info.offset, &info, &final_info); } - + BaseFloat final_freq = (final_info.bin + final_info.offset) * samp_freq_ / (num_samples_padded_ * 4); KALDI_VLOG(4) << "Final optimized info is: freq " << final_freq << ", cos coeff " << final_info.cos_coeff << ", sin coeff " @@ -390,12 +390,12 @@ BaseFloat SinusoidDetector::DetectSinusoid( Let the signal, as a vector, be V. We want to maximize the (positive) energy-difference: - ||V||^2 - || V - c C_f - s S_f ||^2 + ||V||^2 - || V - c C_f - s S_f ||^2 where c and s are the coefficients of C_f and S_f. 
This quantity can be expanded as follows, where . means dot product. \delta E = -c^2 C_f.C_f - s^2 S_f.S_f - 2 c s C_f.S_f + 2 c V.C_f + 2 s V.S_f. which can be written as follows, where . means dot-product and ' means transpose: - \delta E = 2 [c s] v - [c s] M [c s]' + \delta E = 2 [c s] v - [c s] M [c s]' where M = [ C_f.C_f, C_f.S_f, C_f.S_f, S_f.S_f ], and v = [V.C_f, V.S_f]. If M is invertible (i.e. for nonzero frequencies), this is maximized by @@ -451,7 +451,7 @@ void SinusoidDetector::ComputeCoefficients() { int32 num_freq = num_samples_padded_ * 2 + 1; cos_.Resize(num_freq, num_samp); sin_.Resize(num_freq, num_samp); - + Vector cc(num_freq), cs(num_freq); for (int32 k = 0; k < num_freq; k++) { BaseFloat freq = k * samp_freq_ / (num_samples_padded_ * 4); @@ -460,10 +460,10 @@ void SinusoidDetector::ComputeCoefficients() { cc(k) = VecVec(c, c); cs(k) = VecVec(c, s); } - - M_.Resize(num_freq, 3, kUndefined); + + M_.Resize(num_freq, 3, kUndefined); Minv_.Resize(num_freq, 3, kUndefined); - + for (int32 k = 0; k < num_freq; k++) { // Let the matrix M be [ a b; b d ]. [we don't write c because c == b]. // We want to compute Minv_. @@ -503,7 +503,7 @@ void SinusoidDetector::FineOptimizeFrequency( std::vector &info = *info_in; if (!info[bin].valid) ComputeBinInfo(signal, bin, &(info[bin])); if (!info[bin+1].valid) ComputeBinInfo(signal, bin+1, &(info[bin+1])); - + const BaseFloat epsilon = 0.02, delta = 0.001; // If the offset is very close to the edges of the bin, move it @@ -527,16 +527,16 @@ void SinusoidDetector::FineOptimizeFrequency( BaseFloat a = VecVec(c, c), b = VecVec(c, s), d = num_samples_ - a; BaseFloat inv_det = 1.0 / (a * d - b * b); BaseFloat inv_a = d * inv_det, inv_b = -b * inv_det, inv_d = a * inv_det; - + BaseFloat v1 = VecVec(c, signal), v2 = VecVec(s, signal); - + BaseFloat delta_e = v1 * v1 * inv_a + v2 * v2 * inv_d + 2 * v1 * v2 * inv_b; - + KALDI_VLOG(4) << "Actual energy-change at frequency " << freq << " is " << delta_e; // "freq" is frequency somewhere in the middle of the bin. - + BaseFloat final_offset, final_energy; QuadraticMaximize(bin_offset, info[bin].energy, delta_e, info[bin+1].energy, &final_offset, &final_energy); @@ -561,7 +561,7 @@ void SinusoidDetector::FineOptimizeFrequency( // Now get the inverse of the M matrix at the final point. BaseFloat a_inv_interp, b_inv_interp, d_inv_interp; - + if ((bin == 0 && final_offset < delta) || (bin == num_samples_padded_ * 2 && final_offset > 1.0 - delta)) { // If we're extremely close to zero or the Nyquist, we'll have trouble @@ -584,7 +584,7 @@ void SinusoidDetector::FineOptimizeFrequency( info[bin+1].cos_dot, final_offset); BaseFloat v2_interp = QuadraticInterpolate(bin_offset, info[bin].sin_dot, v2, info[bin+1].sin_dot, final_offset); - + opt_info->bin = bin; opt_info->offset = final_offset; // Recompute the energy-reduction using the more accurate interpolated values of @@ -596,7 +596,7 @@ void SinusoidDetector::FineOptimizeFrequency( // Compute the coefficients of the cos and sin in the optimal sinusoid, as // M^{-1} v. 
opt_info->cos_coeff = a_inv_interp * v1_interp + b_inv_interp * v2_interp; - opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp; + opt_info->sin_coeff = b_inv_interp * v1_interp + d_inv_interp * v2_interp; } void SinusoidDetector::FindCandidateBins( @@ -611,7 +611,7 @@ void SinusoidDetector::FindCandidateBins( KALDI_ASSERT(info[k].valid); cutoff = std::max(cutoff, info[k].energy); } - + for (int32 k = 0; k < max_bin; k += 4) { BaseFloat energy_upper_bound = factor1_ * std::max(info[k].energy, @@ -628,14 +628,14 @@ void SinusoidDetector::FindCandidateBins2( std::vector *bins2) const { int32 max_bin = num_samples_padded_ * 2; - + BaseFloat cutoff = min_energy; for (int32 k = 0; k <= max_bin; k += 2) { if (info[k].valid) cutoff = std::max(cutoff, info[k].energy); } - for (int32 k = 0; k < max_bin; k += 2) { + for (int32 k = 0; k < max_bin; k += 2) { if (info[k].valid && info[k+2].valid) { BaseFloat energy_upper_bound = factor2_ * std::max(info[k].energy, @@ -645,7 +645,7 @@ void SinusoidDetector::FindCandidateBins2( } } } - + void SinusoidDetector::ComputeBinInfo( const VectorBase &signal, @@ -670,8 +670,6 @@ MultiSinusoidDetector::MultiSinusoidDetector( sample_freq_(sampling_freq), samples_per_frame_subsampled_(0.001 * config.frame_length_ms * static_cast(config.subsample_freq)), - samples_shift_subsampled_(0.001 * config.frame_shift_ms * - static_cast(config.subsample_freq)), waveform_finished_(false), samples_consumed_(0), resampler_(sampling_freq, config.subsample_freq, @@ -726,7 +724,7 @@ int32 MultiSinusoidDetector::NumSubsampledSamplesReady(int32 max_samp) const { ((subsampled_signal_.empty() && samples_consumed_ == 0) || (!subsampled_signal_.empty () && samples_consumed_ < subsampled_signal_[0]->Dim()))); - + int32 ans = -samples_consumed_; for (size_t i = 0; i < subsampled_signal_.size(); i++) { ans += subsampled_signal_[i]->Dim(); @@ -787,7 +785,7 @@ void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) { if (signal_energy == 0.0) return; // min_energy1 is the lowest energy we might care about. - BaseFloat min_energy1 = signal_energy * + BaseFloat min_energy1 = signal_energy * std::min(config_.two_freq_min_total_energy * 0.5, config_.one_freq_min_energy); @@ -830,7 +828,7 @@ void MultiSinusoidDetector::GetNextFrame(MultiSinusoidDetectorOutput *output) { << factor << ". 
(This means sinusoid detection is not " << " working ideally)."; } - + if (DetectedTwoFrequency(signal_energy, sinusoid1, energy1, sinusoid2, energy2, @@ -917,14 +915,14 @@ void DetectSinusoids(const VectorBase &signal, detector->AcceptWaveform(signal); detector->WaveformFinished(); - int32 safety_margin = 10, approx_num_frames = safety_margin + + int32 safety_margin = 10, approx_num_frames = safety_margin + (signal.Dim() / (detector->SamplingFrequency() * detector->FrameShiftSecs())); output_vec.reserve(approx_num_frames); while (!detector->Done()) { output_vec.resize(output_vec.size() + 1); detector->GetNextFrame(&(output_vec.back())); - } + } detector->Reset(); if (output_vec.empty()) { output->Resize(0, 0); diff --git a/src/feat/sinusoid-detection.h b/src/feat/sinusoid-detection.h index 29483fcc30b..f6addc0b530 100644 --- a/src/feat/sinusoid-detection.h +++ b/src/feat/sinusoid-detection.h @@ -55,7 +55,7 @@ class SinusoidDetector { public: SinusoidDetector(BaseFloat samp_freq, int32 num_samp); - + // Detect the dominant sinusoid component in the signal, as long as the // energy-reduction of the signal from subtracting that sinuoid would be >= @@ -65,7 +65,7 @@ class SinusoidDetector { BaseFloat DetectSinusoid(BaseFloat min_energy_change, const VectorBase &signal, Sinusoid *sinusoid); - + // This function does quadratic interpolation for a function that is known at // three equally spaced points [x0 x1 x2] = [0 1 2], and we want the x-value // and corresponding y-value at the maximum of the function within the range @@ -89,7 +89,7 @@ class SinusoidDetector { static BaseFloat QuadraticInterpolate( BaseFloat x1, BaseFloat y0, BaseFloat y1, BaseFloat y2, BaseFloat x); - + private: BaseFloat samp_freq_; @@ -121,14 +121,14 @@ class SinusoidDetector { // containing the values x y z of a symmetric matrix [ a b; b c ]. There is // one of these matrices for each frequency, sampled at one quarter the // spacing of the FFT bins. There is a long comment next to the definition of - // ComputeCoefficients that describes this. + // ComputeCoefficients that describes this. Matrix M_; // Minv_ is the coefficients in the same format as M_, but containing the // corresponding coefficients of the inverse matrix. There is a long comment // next to the definition of ComputeCoefficients that describes this. Matrix Minv_; - + struct InfoForBin { bool valid; @@ -146,9 +146,9 @@ class SinusoidDetector { BaseFloat cos_coeff; BaseFloat sin_coeff; }; - + // Compute the coefficients and energies at the original FFT bins (every - // fourth entry in "info"). + // fourth entry in "info"). void ComputeCoarseInfo(const Vector &fft, std::vector *info) const; @@ -164,11 +164,11 @@ class SinusoidDetector { const std::vector &info, std::vector *bins) const; - + void ComputeBinInfo(const VectorBase &signal, int32 bin, InfoForBin *info) const; - + // For each bin b such that we have valid "info" data for bins b, b+1 and b+2, // does quadratic interpolation to find the maximum predicted energy in the // range [b, b+2]. The location of the maximum predicted energy is output to @@ -186,7 +186,7 @@ class SinusoidDetector { const std::vector &info, int32 *bin_out, BaseFloat *offset_out) const; - + // This function does // (*cos)(t) = cos(2 pi t freq / samp_freq) @@ -195,7 +195,7 @@ class SinusoidDetector { BaseFloat freq, VectorBase *cos, VectorBase *sin); - + // Do fine optimization of the frequency within a bin, given a reasonable // approximate position within it based on interpolation (that should be close // to the optimum). 
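For reference, the quadratic maximization described in the long comment above has the standard closed form: writing w = [c s]', the energy reduction

    \Delta E(w) = 2 w' v - w' M w

is maximized at w* = M^{-1} v, with maximum value

    \Delta E_max = v' M^{-1} v,

which is exactly what FineOptimizeFrequency() evaluates as v1*v1*inv_a + v2*v2*inv_d + 2*v1*v2*inv_b from the cached entries (inv_a, inv_b, inv_d) of M^{-1}; the cos and sin coefficients of the detected sinusoid are then the two components of M^{-1} v, as computed at the end of that function.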
@@ -205,7 +205,7 @@ class SinusoidDetector { BaseFloat offset, std::vector *info, OptimizedInfo *opt_info) const; - + // Computes the coefficients cos_, sin_, and Minv_. void ComputeCoefficients(); @@ -263,7 +263,7 @@ struct MultiSinusoidDetectorConfig { // the following is not critical and is not exported to the // command line. int32 subsample_filter_zeros; - + MultiSinusoidDetectorConfig(): frame_length_ms(20), frame_shift_ms(10), two_freq_min_energy(0.2), two_freq_min_total_energy(0.6), @@ -313,8 +313,8 @@ struct MultiSinusoidDetectorConfig { KALDI_ASSERT(fabs(samples_per_frame_shift - static_cast(samples_per_frame_shift)) < 0.001); - - } + + } }; struct MultiSinusoidDetectorOutput { @@ -338,19 +338,19 @@ class MultiSinusoidDetector { // Initialize sinusoid detector. Sampling frequency must be integer. MultiSinusoidDetector(const MultiSinusoidDetectorConfig &config, - int32 sampling_freq); + int32 sampling_freq); /// This is how the class acccepts its input. You can put the waveform in /// piece by piece, if it's an online application. void AcceptWaveform(const VectorBase &waveform); - + /// The user calls this to announce to the class that the waveform has ended; /// this forces any pending data to be flushed. void WaveformFinished(); /// Resets the state of the class so you can start processing another waveform. - void Reset(); - + void Reset(); + /// This returns true if the class currently has no more data ready to output. bool Done() const; @@ -362,7 +362,7 @@ class MultiSinusoidDetector { BaseFloat FrameShiftSecs() const { return 0.001 * config_.frame_shift_ms; } BaseFloat SamplingFrequency() const { return sample_freq_; } - + private: // Gets the next frame of subsampled signal, and consumes the appropriate // amount of stored data. It is an error to call this if Done() returned @@ -386,23 +386,21 @@ class MultiSinusoidDetector { const Sinusoid &sinusoid2, BaseFloat energy2, MultiSinusoidDetectorOutput *output); - - + + // Returns std::min(max_samp, sum-of-samples-in-subsampled_signal_). // (the std::min is for efficiency so we don't have to visit the // whole list). int32 NumSubsampledSamplesReady(int32 max_samp) const; - + MultiSinusoidDetectorConfig config_; int32 sample_freq_; int32 samples_per_frame_subsampled_; // (samples per frame at subsampled // rate). - int32 samples_shift_subsampled_; // (samples per frame-shift at subsampled - // rate). // True if the user has called WaveformFinished(). bool waveform_finished_; - + // Pieces of the subsampled signal that are awaiting processing. // Normally there will be just one element here, but if someone calls // AcceptWaveform multiple times before getting output, there could @@ -414,12 +412,12 @@ class MultiSinusoidDetector { // (subsampled_signal_.empty() && samples_consumed_ == 0) or // samples_consumed_ < subsampled_signal_[0]->Dim(). int32 samples_consumed_; - - + + // This object is used to subsample the signal. LinearResample resampler_; - // This object is used to detect sinusoids in the subsampled + // This object is used to detect sinusoids in the subsampled // frames. SinusoidDetector detector_; }; diff --git a/src/feat/wave-reader.cc b/src/feat/wave-reader.cc index cb3f287fdd6..389b461d86c 100644 --- a/src/feat/wave-reader.cc +++ b/src/feat/wave-reader.cc @@ -106,7 +106,7 @@ void WaveData::WriteUint16(std::ostream &os, int16 i) { -void WaveData::Read(std::istream &is) { +void WaveData::Read(std::istream &is, ReadDataType read_data) { data_.Resize(0, 0); // clear the data. 
char tmp[5]; @@ -224,13 +224,26 @@ void WaveData::Read(std::istream &is) { if (std::abs(static_cast(riff_chunk_read) + static_cast(data_chunk_size) - static_cast(riff_chunk_size)) > 1) { - // we allow the size to be off by one, because there is a weirdness in the - // format of RIFF files that means that the input may sometimes be padded - // with 1 unused byte to make the total size even. - KALDI_ERR << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " - << "after first data block there will be " << riff_chunk_read - << " + " << data_chunk_size << " bytes " - << "(we do not support reading multiple data chunks)."; + // we allow the size to be off by one without warning, because there is a + // weirdness in the format of RIFF files that means that the input may + // sometimes be padded with 1 unused byte to make the total size even. + KALDI_WARN << "Expected " << riff_chunk_size << " bytes in RIFF chunk, but " + << "after first data block there will be " << riff_chunk_read + << " + " << data_chunk_size << " bytes " + << "(we do not support reading multiple data chunks)."; + } + + if (read_data == kLeaveDataUndefined) { + // we won't actually be reading the data- we'll just be faking that we read + // that data, so the caller can get the metadata. + // assume we'd read the same number of bytes that the data-chunk header + // says we'd read. + int32 num_bytes_read = data_chunk_size; + uint32 num_samp = num_bytes_read / block_align; + data_.Resize(num_channels, num_samp, kUndefined); + return; + } else { + KALDI_ASSERT(read_data == kReadData); } std::vector data_pointer_vec; diff --git a/src/feat/wave-reader.h b/src/feat/wave-reader.h index 64e7bd94d4e..0749022f7d7 100644 --- a/src/feat/wave-reader.h +++ b/src/feat/wave-reader.h @@ -37,6 +37,11 @@ // each WAVE chunk has header sub-chunk 'fmt_' // and one or more data sub-chunks 'data' // +// [Note from Dan: to say that the wave format was ever "specified" anywhere is +// not quite right. The guy who invented the wave format attempted to create +// a formal specification but it did not completely make sense. And there +// doesn't seem to be a consensus on what makes a valid wave file, +// particularly where the accuracy of header information is concerned.] */ @@ -59,6 +64,8 @@ const BaseFloat kWaveSampleMax = 32768.0; /// This class's purpose is to read in Wave files. class WaveData { public: + enum ReadDataType { kReadData, kLeaveDataUndefined }; + WaveData(BaseFloat samp_freq, const MatrixBase &data) : data_(data), samp_freq_(samp_freq) {} @@ -67,7 +74,7 @@ class WaveData { /// Read() will throw on error. It's valid to call Read() more than once-- /// in this case it will destroy what was there before. /// "is" should be opened in binary mode. - void Read(std::istream &is); + void Read(std::istream &is, ReadDataType read_data = kReadData); /// Write() will throw on error. os should be opened in binary mode. void Write(std::ostream &os) const; @@ -92,6 +99,11 @@ class WaveData { samp_freq_ = 0.0; } + void Swap(WaveData *other) { + data_.Swap(&(other->data_)); + std::swap(samp_freq_, other->samp_freq_); + } + private: static const uint32 kBlockSize = 1024 * 1024; // Use 1M bytes. Matrix data_; @@ -106,8 +118,11 @@ class WaveData { }; -// Holder class for .wav files that enables us to read (but not write) -// .wav files. c.f. util/kaldi-holder.h +// Holder class for .wav files that enables us to read (but not write) .wav +// files. c.f. 
util/kaldi-holder.h we don't use the KaldiObjectHolder template +// because we don't want to check for the \0B binary header. We could have faked +// it by pretending to read in the wave data in text mode after failing to find +// the \0B header, but that would have been a little ugly. class WaveHolder { public: typedef WaveData T; @@ -120,8 +135,8 @@ class WaveHolder { t.Write(os); // throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (writing)."; - if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + KALDI_WARN << "Exception caught in WaveHolder object (writing). " + << e.what(); return false; // write failure. } } @@ -147,12 +162,71 @@ class WaveHolder { t_.Read(is); // throws exception on failure. return true; } catch (const std::exception &e) { - KALDI_WARN << "Exception caught in WaveHolder object (reading)."; - if (!IsKaldiError(e.what())) { std::cerr << e.what(); } + KALDI_WARN << "Exception caught in WaveHolder object (reading). " + << e.what(); return false; // write failure. } } + void Swap(WaveHolder *other) { + t_.Swap(&(other->t_)); + } + + bool ExtractRange(const WaveHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } + + private: + T t_; +}; + +// This is like WaveHolder but when you just want the metadata- +// it leaves the actual data undefined, it doesn't read it. +class WaveInfoHolder { + public: + typedef WaveData T; + + static bool Write(std::ostream &os, bool binary, const T &t) { + KALDI_ERR << "This holder type does not support writing."; + return true; + } + + void Copy(const T &t) { t_.CopyFrom(t); } + + static bool IsReadInBinary() { return true; } + + void Clear() { t_.Clear(); } + + const T &Value() { return t_; } + + WaveInfoHolder &operator = (const WaveInfoHolder &other) { + t_.CopyFrom(other.t_); + return *this; + } + WaveInfoHolder(const WaveInfoHolder &other): t_(other.t_) {} + + WaveInfoHolder() {} + + bool Read(std::istream &is) { + try { + t_.Read(is, WaveData::kLeaveDataUndefined); // throws exception on failure. + return true; + } catch (const std::exception &e) { + KALDI_WARN << "Exception caught in WaveHolder object (reading). " + << e.what(); + return false; // write failure. 
+ } + } + + void Swap(WaveInfoHolder *other) { + t_.Swap(&(other->t_)); + } + + bool ExtractRange(const WaveInfoHolder &other, const std::string &range) { + KALDI_ERR << "ExtractRange is not defined for this type of holder."; + return false; + } private: T t_; }; diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 0ff5f58904e..8c3592908a8 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -7,22 +7,23 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \ compute-cmvn-stats add-deltas remove-mean apply-cmvn transform-feats \ copy-feats compose-transforms splice-feats extract-segments subset-feats \ feat-to-len feat-to-dim fmpe-apply-transform fmpe-acc-stats fmpe-init \ - fmpe-est fmpe-copy fmpe-sum-accs append-feats extend-transform-dim \ + fmpe-est fmpe-copy fmpe-sum-accs extend-transform-dim \ get-full-lda-mat compute-spectrogram-feats extract-feature-segments \ reverse-feats paste-feats select-feats subsample-feats process-pitch-feats \ interpolate-pitch copy-feats-to-htk copy-feats-to-sphinx extract-rows \ apply-cmvn-sliding compute-cmvn-stats-two-channel compute-kaldi-pitch-feats \ process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \ compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \ - append-vector-to-feats detect-sinusoids + wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \ + concat-feats -OBJFILES = +OBJFILES = TESTFILES = ADDLIBS = ../feat/kaldi-feat.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../thread/kaldi-thread.a ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/featbin/append-feats.cc b/src/featbin/append-feats.cc deleted file mode 100644 index cf373d7a30a..00000000000 --- a/src/featbin/append-feats.cc +++ /dev/null @@ -1,100 +0,0 @@ -// featbin/append-feats.cc - -// Copyright 2012 Petr Motlicek Pawel Swietojanski -// Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
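The file removed below already described itself as deprecated in favour of paste-feats, and the Makefile change above drops it while adding wav-reverberate, shift-feats and concat-feats. An equivalent invocation of the replacement is, for example, paste-feats scp:list1.scp scp:list2.scp ark:- ; the old --truncate-frames option roughly corresponds to paste-feats' --length-tolerance, which trims the streams to the shortest length when they differ by no more than the given number of frames and skips the utterance otherwise.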
- -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "matrix/kaldi-matrix.h" - - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - - const char *usage = - "Append 2 feature-streams [and possibly change format]\n" - "Note, this is deprecated; please use paste-feats\n" - "Usage: append-feats [options] \n" - "\n" - "e.g.: append-feats --feats-offset-in1 5 --num-feats-in1 5 scp:list1.scp " - "scp:list2.scp ark:-\n"; - - ParseOptions po(usage); - - bool truncate_frames = false; - - po.Register("truncate-frames", &truncate_frames, "If true, do not treat it " - "as an error when files differ in number of frames, but truncate " - "the longest one."); - - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - std::string rspecifier1 = po.GetArg(1), - rspecifier2 = po.GetArg(2), - wspecifier = po.GetArg(3); - - BaseFloatMatrixWriter feats_writer(wspecifier); - SequentialBaseFloatMatrixReader feats_reader1(rspecifier1); - RandomAccessBaseFloatMatrixReader feats_reader2(rspecifier2); - - int32 num_done = 0, num_err = 0; - - for (; !feats_reader1.Done(); feats_reader1.Next()) { - std::string utt = feats_reader1.Key(); - if (!feats_reader2.HasKey(utt)) { - KALDI_WARN << "Could not find features for " << utt << " in " - << rspecifier2 << ": producing no output for the utterance"; - num_err++; - continue; - } - - const Matrix &feats1 = feats_reader1.Value(); - const Matrix &feats2 = feats_reader2.Value(utt); - if (feats1.NumRows() != feats2.NumRows() && !truncate_frames) { - KALDI_WARN << "For utterance " << utt << ", features have different " - << "#frames " << feats1.NumRows() << " vs. " - << feats2.NumRows() << ", producing no output (use " - << "--truncate-frames=true if you want output)"; - num_err++; - continue; - } - int32 num_frames = std::min(feats1.NumRows(), feats2.NumRows()), - dim1 = feats1.NumCols(), dim2 = feats2.NumCols(); - Matrix output(num_frames, dim1 + dim2, kUndefined); - output.Range(0, num_frames, 0, dim1).CopyFromMat( - feats1.Range(0, num_frames, 0, dim1)); - output.Range(0, num_frames, dim1, dim2).CopyFromMat( - feats2.Range(0, num_frames, 0, dim2)); - - feats_writer.Write(utt, output); - num_done++; - } - KALDI_LOG << "Appended " << num_done << " feats; " << num_err - << " with errors."; - return (num_done != 0 ? 0 : 1); - } catch (const std::exception& e) { - std::cerr << e.what(); - return -1; - } -} diff --git a/src/featbin/append-vector-to-feats.cc b/src/featbin/append-vector-to-feats.cc index 58965159fda..5ca6ae97063 100644 --- a/src/featbin/append-vector-to-feats.cc +++ b/src/featbin/append-vector-to-feats.cc @@ -35,7 +35,7 @@ void AppendVectorToFeats(const Matrix &in, 0, in.NumCols()).CopyFromMat(in); out->Range(0, in.NumRows(), in.NumCols(), vec.Dim()).CopyRowsFromVec(vec); -} +} } @@ -44,31 +44,32 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = "Append a vector to each row of input feature files\n" "\n" "Usage: append-vector-to-feats \n" - " or: append-feats \n"; - + " or: append-vector-to-feats \n" + "See also: paste-feats, concat-feats\n"; + ParseOptions po(usage); bool binary = true; po.Register("binary", &binary, "If true, output files in binary " "(only relevant for single-file operation, i.e. no tables)"); - + po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // We're operating on tables, e.g. archives. 
- - + + string feat_rspecifier = po.GetArg(1); SequentialBaseFloatMatrixReader feat_reader(feat_rspecifier); @@ -77,22 +78,22 @@ int main(int argc, char *argv[]) { string wspecifier = po.GetArg(3); BaseFloatMatrixWriter feat_writer(wspecifier); - + int32 num_done = 0, num_err = 0; // Main loop for (; !feat_reader.Done(); feat_reader.Next()) { string utt = feat_reader.Key(); KALDI_VLOG(2) << "Processing utterance " << utt; - + const Matrix &feats(feat_reader.Value()); - + if (!vec_reader.HasKey(utt)) { KALDI_WARN << "Could not read vector for utterance " << utt; num_err++; - continue; + continue; } const Vector &vec(vec_reader.Value(utt)); - + Matrix output; AppendVectorToFeats(feats, vec, &output); feat_writer.Write(utt, output); @@ -132,7 +133,7 @@ EOF cat < 2.vec [ 0 1 ] EOF -append-vector-to-feats --binary=false 1.mat 2.vec 3a.mat +append-vector-to-feats --binary=false 1.mat 2.vec 3a.mat cat < 3b.mat [ 0 1 2 0 1 3 4 5 0 1 diff --git a/src/featbin/compute-spectrogram-feats.cc b/src/featbin/compute-spectrogram-feats.cc index 42f4eeb3602..3a74eb94b2f 100644 --- a/src/featbin/compute-spectrogram-feats.cc +++ b/src/featbin/compute-spectrogram-feats.cc @@ -118,7 +118,7 @@ int main(int argc, char *argv[]) { SubVector waveform(wave_data.Data(), this_chan); Matrix features; try { - spec.Compute(waveform, &features, NULL); + spec.Compute(waveform, 1.0, &features, NULL); } catch (...) { KALDI_WARN << "Failed to compute features for utterance " << utt; diff --git a/src/featbin/concat-feats.cc b/src/featbin/concat-feats.cc new file mode 100644 index 00000000000..1f926061772 --- /dev/null +++ b/src/featbin/concat-feats.cc @@ -0,0 +1,97 @@ +// featbin/concat-feats.cc + +// Copyright 2013 Johns Hopkins University (Author: Daniel Povey) +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + +namespace kaldi { + +/* + This function concatenates several sets of feature vectors + to form a longer set. The length of the output will be equal + to the sum of lengths of the inputs but the dimension will be + the same to the inputs. +*/ + +void ConcatFeats(const std::vector > &in, + Matrix *out) { + KALDI_ASSERT(in.size() >= 1); + int32 tot_len = in[0].NumRows(), + dim = in[0].NumCols(); + for (int32 i = 1; i < in.size(); i++) { + KALDI_ASSERT(in[i].NumCols() == dim); + tot_len += in[i].NumRows(); + } + out->Resize(tot_len, dim); + int32 len_offset = 0; + for (int32 i = 0; i < in.size(); i++) { + int32 this_len = in[i].NumRows(); + out->Range(len_offset, this_len, 0, dim).CopyFromMat( + in[i]); + len_offset += this_len; + } +} + + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace std; + + const char *usage = + "Concatenate feature files (assuming they have the same dimensions)\n" + "Usage: concat-feats [ ...] \n" + " e.g. 
concat-feats mfcc/foo.ark:12343 mfcc/foo.ark:56789 -\n" + "See also: copy-feats, append-vector-to-feats, paste-feats\n"; + + ParseOptions po(usage); + + bool binary = true; + po.Register("binary", &binary, "If true, output files in binary " + "(only relevant for single-file operation, i.e. no tables)"); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::vector > feats(po.NumArgs() - 1); + for (int32 i = 1; i < po.NumArgs(); i++) + ReadKaldiObject(po.GetArg(i), &(feats[i-1])); + Matrix output; + ConcatFeats(feats, &output); + std::string output_wxfilename = po.GetArg(po.NumArgs()); + WriteKaldiObject(output, output_wxfilename, binary); + + // This will tend to produce too much output if we have a logging mesage. + // KALDI_LOG << "Wrote concatenated features to " << output_wxfilename; + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/featbin/copy-feats-to-htk.cc b/src/featbin/copy-feats-to-htk.cc index 4c7834a89a1..ba0711414c5 100644 --- a/src/featbin/copy-feats-to-htk.cc +++ b/src/featbin/copy-feats-to-htk.cc @@ -50,8 +50,8 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); std::string dir_out = "./"; std::string ext_out = "fea"; - int32 sample_period = 10000; - int32 sample_kind = 9; //USER + int32 sample_period = 100000; // 100ns unit : 10ms = 100000, + int32 sample_kind = 9; // USER, /* 0 WAVEFORM sampled waveform 1 LPC linear prediction filter coefficients diff --git a/src/featbin/copy-feats.cc b/src/featbin/copy-feats.cc index 3c123b70d60..258466b4f3b 100644 --- a/src/featbin/copy-feats.cc +++ b/src/featbin/copy-feats.cc @@ -34,7 +34,8 @@ int main(int argc, char *argv[]) { "e.g.: copy-feats ark:- ark,scp:foo.ark,foo.scp\n" " or: copy-feats ark:foo.ark ark,t:txt.ark\n" "See also: copy-matrix, copy-feats-to-htk, copy-feats-to-sphinx, select-feats,\n" - "extract-rows, subset-feats, subsample-feats, splice-feats, append-feats\n"; + "extract-rows, subset-feats, subsample-feats, splice-feats, paste-feats,\n" + "concat-feats\n"; ParseOptions po(usage); bool binary = true; @@ -48,7 +49,7 @@ int main(int argc, char *argv[]) { po.Register("compress", &compress, "If true, write output in compressed form" "(only currently supported for wxfilename, i.e. archive/script," "output)"); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -57,7 +58,7 @@ int main(int argc, char *argv[]) { } int32 num_done = 0; - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // Copying tables of features. std::string rspecifier = po.GetArg(1); @@ -101,7 +102,7 @@ int main(int argc, char *argv[]) { return (num_done != 0 ? 0 : 1); } else { KALDI_ASSERT(!compress && "Compression not yet supported for single files"); - + std::string feat_rxfilename = po.GetArg(1), feat_wxfilename = po.GetArg(2); Matrix feat_matrix; diff --git a/src/featbin/extract-feature-segments.cc b/src/featbin/extract-feature-segments.cc index d3b2661b76b..93f599feb3a 100644 --- a/src/featbin/extract-feature-segments.cc +++ b/src/featbin/extract-feature-segments.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Govivace Inc. // 2012-2013 Mirko Hannemann; Arnab Ghoshal +// 2015 Tanel Alumae // See ../../COPYING for clarification regarding multiple authors // @@ -33,50 +34,69 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; - + const char *usage = "Create feature files by segmenting input files.\n" - "Usage: extract-feature-segments [options...] 
\n" - " (segments-file has lines like: output-utterance-id input-utterance-or-spk-id 1.10 2.36)\n"; + "Usage: " + "extract-feature-segments [options...] " + " \n" + " (segments-file has lines like: " + "output-utterance-id input-utterance-or-spk-id 1.10 2.36)\n"; // construct all the global objects ParseOptions po(usage); BaseFloat min_segment_length = 0.1, // Minimum segment length in seconds. max_overshoot = 0.0; // max time by which last segment can overshoot - BaseFloat samp_freq = 100; // feature sampling frequency (assuming 10ms window shift) + int32 frame_shift = 10; + int32 frame_length = 25; + bool snip_edges = true; // Register the options po.Register("min-segment-length", &min_segment_length, - "Minimum segment length in seconds (reject shorter segments)"); - po.Register("frame-rate", &samp_freq, - "Feature sampling frequency (e.g. 100 for 10ms window shift)"); + "Minimum segment length in seconds (reject shorter segments)"); + po.Register("frame-length", &frame_length, "Frame length in milliseconds"); + po.Register("frame-shift", &frame_shift, "Frame shift in milliseconds"); po.Register("max-overshoot", &max_overshoot, - "End segments overshooting by less (in seconds) are truncated," - " else rejected."); + "End segments overshooting by less (in seconds) are truncated," + " else rejected."); + po.Register("snip-edges", &snip_edges, + "If true, n_frames frames will be snipped from the end of each " + "extracted feature matrix, " + "where n_frames = ceil((frame_length - frame_shift) / frame_shift), " + "This ensures that only the feature vectors that " + "completely fit in the segment are extracted. " + "This makes the extracted segment lengths match the lengths of the " + "features that have been extracted from already segmented audio."); // OPTION PARSING ... // parse options (+filling the registered variables) po.Read(argc, argv); - // number of arguments should be 3(scriptfile,segments file and outputwav write mode) + // number of arguments should be 3 + // (scriptfile, segments file and outputwav write mode) if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } - - std::string rspecifier = po.GetArg(1); // get script file/feature archive - std::string segments_rxfilename = po.GetArg(2);// get segment file - std::string wspecifier = po.GetArg(3); // get written archive name + std::string rspecifier = po.GetArg(1); // get script file/feature archive + std::string segments_rxfilename = po.GetArg(2); // get segment file + std::string wspecifier = po.GetArg(3); // get written archive name BaseFloatMatrixWriter feat_writer(wspecifier); - RandomAccessBaseFloatMatrixReader feat_reader(rspecifier); + RandomAccessBaseFloatMatrixReader feat_reader(rspecifier); - Input ki(segments_rxfilename); // no binary argment: never binary. + Input ki(segments_rxfilename); // no binary argment: never binary. 
int32 num_lines = 0, num_success = 0; - + + int32 snip_length = 0; + if (snip_edges) { + snip_length = static_cast(ceil( + 1.0 * (frame_length - frame_shift) / frame_shift)); + } + std::string line; /* read each line from segments file */ while (std::getline(ki.Stream(), line)) { @@ -106,18 +126,20 @@ int main(int argc, char *argv[]) { KALDI_WARN << "Invalid line in segments file [bad end]: " << line; continue; } + // start time must not be negative; start time must not be greater than // end time, except if end time is -1 if (start < 0 || end <= 0 || start >= end) { - KALDI_WARN << "Invalid line in segments file [empty or invalid segment]: " + KALDI_WARN << "Invalid line in segments file " + "[empty or invalid segment]: " << line; continue; } int32 channel = -1; // means channel info is unspecified. // if each line has 5 elements then 5th element must be channel identifier - if(split_line.size() == 5) { + if (split_line.size() == 5) { if (!ConvertStringToInteger(split_line[4], &channel) || channel < 0) { - KALDI_WARN << "Invalid line in segments file [bad channel]: " << line; + KALDI_WARN<< "Invalid line in segments file [bad channel]: " << line; continue; } } @@ -131,45 +153,62 @@ int main(int argc, char *argv[]) { continue; } const Matrix &feats = feat_reader.Value(utterance); - int32 num_samp = feats.NumRows(), // total number of samples present in wav data - num_chan = feats.NumCols(); // total number of channels present in wav file - + // total number of samples present in wav data + int32 num_samp = feats.NumRows(); + // total number of channels present in wav file + int32 num_chan = feats.NumCols(); // Convert start & end times of the segment to corresponding sample number - int32 start_samp = static_cast(start * samp_freq); - int32 end_samp = static_cast(end * samp_freq); + int32 start_samp = static_cast(round( + (start * 1000.0 / frame_shift))); + int32 end_samp = static_cast(round(end * 1000.0 / frame_shift)); + + if (snip_edges) { + // snip the edge at the end of the segment (usually 2 frames), + end_samp -= snip_length; + } + /* start sample must be less than total number of samples * otherwise skip the segment */ if (start_samp < 0 || start_samp >= num_samp) { - KALDI_WARN << "Start sample out of range " << start_samp << " [length:] " - << num_samp << "x" << num_chan << ", skipping segment " << segment; + KALDI_WARN << "Start sample out of range " << start_samp + << " [length:] " << num_samp << "x" << num_chan + << ", skipping segment " << segment; continue; } + /* end sample must be less than total number samples * otherwise skip the segment */ if (end_samp > num_samp) { - if (end_samp >= - num_samp + static_cast(max_overshoot * samp_freq)) { - KALDI_WARN << "End sample too far out of range " << end_samp - << " [length:] " << num_samp << "x" << num_chan << ", skipping segment " - << segment; + if (end_samp >= num_samp + + static_cast( + round(max_overshoot * 1000.0 / frame_shift))) { + KALDI_WARN<< "End sample too far out of range " << end_samp + << " [length:] " << num_samp << "x" << num_chan + << ", skipping segment " + << segment; continue; } - end_samp = num_samp; // for small differences, just truncate. + end_samp = num_samp; // for small differences, just truncate. 
} + /* check whether the segment size is less than minimum segment length(default 0.1 sec) * if yes, skip the segment */ - if (end_samp <= - start_samp + static_cast(min_segment_length * samp_freq)) { - KALDI_WARN << "Segment " << segment << " too short, skipping it."; + if (end_samp + <= start_samp + + static_cast(round( + (min_segment_length * 1000.0 / frame_shift)))) { + KALDI_WARN<< "Segment " << segment << " too short, skipping it."; continue; } - SubMatrix segment_matrix(feats, start_samp, end_samp-start_samp, 0, num_chan); + SubMatrix segment_matrix(feats, start_samp, + end_samp-start_samp, 0, num_chan); Matrix outmatrix(segment_matrix); - feat_writer.Write(segment, outmatrix); // write segment in feature archive. + // write segment in feature archive. + feat_writer.Write(segment, outmatrix); num_success++; } KALDI_LOG << "Successfully processed " << num_success << " lines out of " diff --git a/src/featbin/extract-segments.cc b/src/featbin/extract-segments.cc index 47afca5668d..f5ed4441a03 100644 --- a/src/featbin/extract-segments.cc +++ b/src/featbin/extract-segments.cc @@ -20,7 +20,6 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "feat/feature-mfcc.h" #include "feat/wave-reader.h" /*! @brief This is the main program for extracting segments from a wav file @@ -123,7 +122,7 @@ int main(int argc, char *argv[]) { /* check whether a segment start time and end time exists in recording * if fails , skips the segment. */ - if (!reader.HasKey(recording)) { + if (!reader.HasKey(recording)) { KALDI_WARN << "Could not find recording " << recording << ", skipping segment " << segment; continue; diff --git a/src/featbin/paste-feats.cc b/src/featbin/paste-feats.cc index 5eab09d96c1..553bca9064c 100644 --- a/src/featbin/paste-feats.cc +++ b/src/featbin/paste-feats.cc @@ -50,7 +50,7 @@ bool AppendFeats(const std::vector > &in, } if (max_len - min_len > 0) { KALDI_VLOG(2) << "Length mismatch " << max_len << " vs. " << min_len - << (utt.empty() ? "" : " for utt ") << utt + << (utt.empty() ? "" : " for utt ") << utt << " within tolerance " << tolerance; } out->Resize(min_len, tot_dim); @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = "Paste feature files (assuming they have the same lengths); think of the\n" "unix command paste a b.\n" @@ -79,8 +79,8 @@ int main(int argc, char *argv[]) { " or: paste-feats [ ...] \n" " e.g. paste-feats ark:feats1.ark \"ark:select-feats 0-3 ark:feats2.ark ark:- |\" ark:feats-out.ark\n" " or: paste-feats foo.mat bar.mat baz.mat\n" - "See also: copy-feats, copy-matrix, append-vector-to-feats, concat-feats\n"; - + "See also: copy-feats, copy-matrix, append-vector-to-feats\n"; + ParseOptions po(usage); int32 length_tolerance = 0; @@ -90,22 +90,22 @@ int main(int argc, char *argv[]) { " difference of length-tolerance, otherwise exclude segment."); po.Register("binary", &binary, "If true, output files in binary " "(only relevant for single-file operation, i.e. no tables)"); - + po.Read(argc, argv); - + if (po.NumArgs() < 3) { po.PrintUsage(); exit(1); } - + if (ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier) { // We're operating on tables, e.g. archives. 
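Back in extract-feature-segments above, the selected frame range is cut out with a SubMatrix view and then copied into a regular Matrix before being written to the archive. A hedged, stand-alone usage sketch of that pattern (not part of the patch; sizes are made up):

#include "base/kaldi-common.h"
#include "matrix/kaldi-matrix.h"

int main() {
  using namespace kaldi;
  Matrix<BaseFloat> feats(300, 13);   // e.g. 300 frames of 13-dim MFCCs
  feats.SetRandn();
  int32 start_frame = 110, num_frames = 124;
  // View of rows [start_frame, start_frame + num_frames), all columns:
  SubMatrix<BaseFloat> segment(feats, start_frame, num_frames, 0, feats.NumCols());
  Matrix<BaseFloat> out(segment);     // deep copy, ready to be written out
  KALDI_ASSERT(out.NumRows() == num_frames);
  return 0;
}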
- + // Last argument is output string wspecifier = po.GetArg(po.NumArgs()); BaseFloatMatrixWriter feat_writer(wspecifier); - + // First input is sequential string rspecifier1 = po.GetArg(1); SequentialBaseFloatMatrixReader input1(rspecifier1); @@ -117,14 +117,14 @@ int main(int argc, char *argv[]) { RandomAccessBaseFloatMatrixReader *rd = new RandomAccessBaseFloatMatrixReader(rspecifier); input.push_back(rd); } - + int32 num_done = 0, num_err = 0; - + // Main loop for (; !input1.Done(); input1.Next()) { string utt = input1.Key(); KALDI_VLOG(2) << "Merging features for utterance " << utt; - + // Collect features from streams to vector 'feats' vector > feats(po.NumArgs() - 1); feats[0] = input1.Value(); @@ -189,7 +189,7 @@ cat < 2.mat [ 0 1 2 3 ] EOF -paste-feats --length-tolerance=1 --binary=false 1.mat 2.mat 3a.mat +paste-feats --length-tolerance=1 --binary=false 1.mat 2.mat 3a.mat cat < 3b.mat [ 0 1 2 0 1 3 4 5 2 3 ] diff --git a/src/featbin/shift-feats.cc b/src/featbin/shift-feats.cc new file mode 100644 index 00000000000..7b970e92248 --- /dev/null +++ b/src/featbin/shift-feats.cc @@ -0,0 +1,90 @@ +// featbin/shift-feats.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2013-2015 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "matrix/kaldi-matrix.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Copy features and possibly shift them in time while maintaining the length, e.g.\n" + "shift-feats --shift=1 will shift all frames to the\n" + "right by one (the first frame would be duplicated).\n" + "See also: copy-feats, copy-matrix\n"; + + ParseOptions po(usage); + int32 shift = 0; + po.Register("shift", &shift, "Number of frames by which to shift the features."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + int32 num_done = 0, num_err = 0; + + SequentialBaseFloatMatrixReader feat_reader(po.GetArg(1)); + BaseFloatMatrixWriter feat_writer(po.GetArg(2)); + + + for (; !feat_reader.Done(); feat_reader.Next()) { + const std::string &key = feat_reader.Key(); + const Matrix &src = feat_reader.Value(); + if (src.NumRows() == 0) { + KALDI_WARN << "Empty matrix for key " << key; + num_err++; + continue; + } + Matrix rearranged(src.NumRows(), src.NumCols()); + for (int32 r = 0; r < src.NumRows(); r++) { + int32 src_r = r - shift; + if (src_r < 0) src_r = 0; + if (src_r >= src.NumRows()) src_r = src.NumRows() - 1; + rearranged.Row(r).CopyFromVec(src.Row(src_r)); + } + feat_writer.Write(key, rearranged); + num_done++; + } + + KALDI_LOG << "Shifted " << num_done << " features by " + << shift << " frames; " << num_err << " with errors."; + return (num_done > 0 ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + + +/* +test: + echo "foo [ 1 1; 2 2; 3 3 ]" | shift-feats --shift=1 ark:- ark,t:- + outputs: + foo [ + 1 1 + 1 1 + 2 2 ] +*/ diff --git a/src/featbin/subsample-feats.cc b/src/featbin/subsample-feats.cc index 9a8d5520433..0d79ce5030f 100644 --- a/src/featbin/subsample-feats.cc +++ b/src/featbin/subsample-feats.cc @@ -31,17 +31,17 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; using namespace std; - + const char *usage = - "Sub-samples features by taking every n'th frame." + "Sub-samples features by taking every n'th frame.\n" "With negative values of n, will repeat each frame n times\n" "(e.g. --n=-2 will repeat each frame twice)\n" "\n" "Usage: subsample-feats [options] \n" " e.g. subsample-feats --n=2 ark:- ark:-\n"; - + ParseOptions po(usage); - + int32 n = 1, offset = 0; po.Register("n", &n, "Take every n'th feature, for this value of n" @@ -53,23 +53,23 @@ int main(int argc, char *argv[]) { if (n < 0) KALDI_ASSERT(offset == 0 && "--offset option cannot be used with negative n."); - + po.Read(argc, argv); - + if (po.NumArgs() != 2) { po.PrintUsage(); exit(1); - } + } string rspecifier = po.GetArg(1); string wspecifier = po.GetArg(2); - + SequentialBaseFloatMatrixReader feat_reader(rspecifier); BaseFloatMatrixWriter feat_writer(wspecifier); int32 num_done = 0, num_err = 0; int64 frames_in = 0, frames_out = 0; - + // process all keys for (; !feat_reader.Done(); feat_reader.Next()) { std::string utt = feat_reader.Key(); @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) { frames_in += feats.NumRows(); frames_out += num_indexes; - + if (num_indexes == 0) { KALDI_WARN << "For utterance " << utt << ", output would have no rows, " << "producing no output."; @@ -108,7 +108,7 @@ int main(int argc, char *argv[]) { output.Row(i).CopyFromVec(feats.Row(i / repeat)); frames_in += feats.NumRows(); frames_out += feats.NumRows() * repeat; - feat_writer.Write(utt, output); + feat_writer.Write(utt, output); num_done++; } } diff --git a/src/featbin/wav-reverberate.cc b/src/featbin/wav-reverberate.cc new file mode 100644 index 00000000000..d7599c5ea3d --- /dev/null +++ b/src/featbin/wav-reverberate.cc @@ -0,0 +1,260 @@ +// featbin/wav-reverberate.cc + +// Copyright 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "feat/wave-reader.h" +#include "feat/signal.h" + +namespace kaldi { + +/* + This function is to repeatedly concatenate signal1 by itself + to match the length of signal2 and add the two signals together. 
+*/ +void AddVectorsOfUnequalLength(const Vector &signal1, Vector *signal2) { + for (int32 po = 0; po < signal2->Dim(); po += signal1.Dim()) { + int32 block_length = signal1.Dim(); + if (signal2->Dim() - po < block_length) block_length = signal2->Dim() - po; + signal2->Range(po, block_length).AddVec(1.0, signal1.Range(0, block_length)); + } +} + +BaseFloat MaxAbsolute(const Vector &vector) { + return std::max(std::abs(vector.Max()), std::abs(vector.Min())); +} + +/* + Early reverberation component of the signal is composed of reflections + within 0.05 seconds of the direct path signal (assumed to be the peak of + the room impulse response). This function returns the energy in + this early reverberation component of the signal. + The input parameters to this function are the room impulse response, the signal + and their sampling frequency respectively. +*/ +BaseFloat ComputeEarlyReverbEnergy(const Vector &rir, const Vector &signal, + BaseFloat samp_freq) { + int32 peak_index = 0; + rir.Max(&peak_index); + KALDI_VLOG(1) << "peak index is " << peak_index; + + const float sec_before_peak = 0.001; + const float sec_after_peak = 0.05; + int32 early_rir_start_index = peak_index - sec_before_peak * samp_freq; + int32 early_rir_end_index = peak_index + sec_after_peak * samp_freq; + if (early_rir_start_index < 0) early_rir_start_index = 0; + if (early_rir_end_index > rir.Dim()) early_rir_end_index = rir.Dim(); + + int32 duration = early_rir_end_index - early_rir_start_index; + Vector early_rir(rir.Range(early_rir_start_index, duration)); + Vector early_reverb(signal); + FFTbasedBlockConvolveSignals(early_rir, &early_reverb); + + // compute the energy + return VecVec(early_reverb, early_reverb) / early_reverb.Dim(); +} + +/* + This is the core function to do reverberation and noise addition + on the given signal. The noise will be scaled before the addition + to match the given signal-to-noise ratio (SNR) and it will also concatenate + itself repeatedly to match the length of the signal. + The input parameters to this function are the room impulse response, + the sampling frequency, the SNR(dB), the noise and the signal respectively. +*/ +void DoReverberation(const Vector &rir, BaseFloat samp_freq, + BaseFloat snr_db, Vector *noise, + Vector *signal) { + if (noise->Dim()) { + float input_power = ComputeEarlyReverbEnergy(rir, *signal, samp_freq); + float noise_power = VecVec(*noise, *noise) / noise->Dim(); + float scale_factor = sqrt(pow(10, -snr_db / 10) * input_power / noise_power); + noise->Scale(scale_factor); + KALDI_VLOG(1) << "Noise signal is being scaled with " << scale_factor + << " to generate output with SNR " << snr_db << "db\n"; + } + + FFTbasedBlockConvolveSignals(rir, signal); + + if (noise->Dim() > 0) { + AddVectorsOfUnequalLength(*noise, signal); + } +} +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + + const char *usage = + "Corrupts the wave files supplied via input pipe with the specified\n" + "room-impulse response (rir_matrix) and additive noise distortions\n" + "(specified by corresponding files).\n" + "Usage: wav-reverberate [options...] 
" + " \n" + "e.g.\n" + "wav-reverberate --noise-file=noise.wav \\\n" + " input.wav rir.wav output.wav\n"; + + ParseOptions po(usage); + std::string noise_file; + BaseFloat snr_db = 20; + bool multi_channel_output = false; + int32 input_channel = 0; + int32 rir_channel = 0; + int32 noise_channel = 0; + bool normalize_output = true; + BaseFloat volume = 0; + + po.Register("multi-channel-output", &multi_channel_output, + "Specifies if the output should be multi-channel or not"); + po.Register("input-wave-channel", &input_channel, + "Specifies the channel to be used from input as only a " + "single channel will be used to generate reverberated output"); + po.Register("rir-channel", &rir_channel, + "Specifies the channel of the room impulse response, " + "it will only be used when multi-channel-output is false"); + po.Register("noise-channel", &noise_channel, + "Specifies the channel of the noise file, " + "it will only be used when multi-channel-output is false"); + po.Register("noise-file", &noise_file, + "File with additive noise"); + po.Register("snr-db", &snr_db, + "Desired SNR(dB) of the output"); + po.Register("normalize-output", &normalize_output, + "If true, then after reverberating and " + "possibly adding noise, scale so that the signal " + "energy is the same as the original input signal."); + po.Register("volume", &volume, + "If nonzero, a scaling factor on the signal that is applied " + "after reverberating and possibly adding noise. " + "If you set this option to a nonzero value, it will be as" + "if you had also specified --normalize-output=false."); + + po.Read(argc, argv); + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + + if (multi_channel_output) { + if (rir_channel != 0 || noise_channel != 0) + KALDI_WARN << "options for --rir-channel and --noise-channel" + "are ignored as --multi-channel-output is true."; + } + + std::string input_wave_file = po.GetArg(1); + std::string rir_file = po.GetArg(2); + std::string output_wave_file = po.GetArg(3); + + WaveData input_wave; + { + Input ki(input_wave_file); + input_wave.Read(ki.Stream()); + } + + const Matrix &input_matrix = input_wave.Data(); + BaseFloat samp_freq_input = input_wave.SampFreq(); + int32 num_samp_input = input_matrix.NumCols(), // #samples in the input + num_input_channel = input_matrix.NumRows(); // #channels in the input + KALDI_VLOG(1) << "sampling frequency of input: " << samp_freq_input + << " #samples: " << num_samp_input + << " #channel: " << num_input_channel; + KALDI_ASSERT(input_channel < num_input_channel); + + WaveData rir_wave; + { + Input ki(rir_file); + rir_wave.Read(ki.Stream()); + } + const Matrix &rir_matrix = rir_wave.Data(); + BaseFloat samp_freq_rir = rir_wave.SampFreq(); + int32 num_samp_rir = rir_matrix.NumCols(), + num_rir_channel = rir_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of rir: " << samp_freq_rir + << " #samples: " << num_samp_rir + << " #channel: " << num_rir_channel; + if (!multi_channel_output) { + KALDI_ASSERT(rir_channel < num_rir_channel); + } + + Matrix noise_matrix; + if (!noise_file.empty()) { + WaveData noise_wave; + { + Input ki(noise_file); + noise_wave.Read(ki.Stream()); + } + noise_matrix = noise_wave.Data(); + BaseFloat samp_freq_noise = noise_wave.SampFreq(); + int32 num_samp_noise = noise_matrix.NumCols(), + num_noise_channel = noise_matrix.NumRows(); + KALDI_VLOG(1) << "sampling frequency of noise: " << samp_freq_noise + << " #samples: " << num_samp_noise + << " #channel: " << num_noise_channel; + if (multi_channel_output) { + 
KALDI_ASSERT(num_rir_channel == num_noise_channel); + } else { + KALDI_ASSERT(noise_channel < num_noise_channel); + } + } + + int32 num_output_channels = (multi_channel_output ? num_rir_channel : 1); + Matrix out_matrix(num_output_channels, num_samp_input); + + for (int32 output_channel = 0; output_channel < num_output_channels; output_channel++) { + Vector input(num_samp_input); + input.CopyRowFromMat(input_matrix, input_channel); + float power_before_reverb = VecVec(input, input) / input.Dim(); + + int32 this_rir_channel = (multi_channel_output ? output_channel : rir_channel); + Vector rir(num_samp_rir); + rir.CopyRowFromMat(rir_matrix, this_rir_channel); + rir.Scale(1.0 / (1 << 15)); + + Vector noise(0); + if (!noise_file.empty()) { + noise.Resize(noise_matrix.NumCols()); + int32 this_noise_channel = (multi_channel_output ? output_channel : noise_channel); + noise.CopyRowFromMat(noise_matrix, this_noise_channel); + } + + DoReverberation(rir, samp_freq_rir, snr_db, &noise, &input); + + float power_after_reverb = VecVec(input, input) / input.Dim(); + + if (volume > 0) + input.Scale(volume); + else if (normalize_output) + input.Scale(sqrt(power_before_reverb / power_after_reverb)); + + out_matrix.CopyRowFromVec(input, output_channel); + } + + WaveData out_wave(samp_freq_input, out_matrix); + Output ko(output_wave_file, false); + out_wave.Write(ko.Stream()); + + return 0; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} + diff --git a/src/featbin/wav-to-duration.cc b/src/featbin/wav-to-duration.cc index b0f23f35115..2eb95dc3fc1 100644 --- a/src/featbin/wav-to-duration.cc +++ b/src/featbin/wav-to-duration.cc @@ -30,10 +30,17 @@ int main(int argc, char *argv[]) { "the duration of each one in seconds.\n" "Usage: wav-to-duration [options...] 
\n" "E.g.: wav-to-duration scp:wav.scp ark,t:-\n" - "See also: wav-copy extract-segments feat-to-len\n"; + "See also: wav-copy extract-segments feat-to-len\n" + "Currently this program may output a lot of harmless warnings regarding\n" + "nonzero exit status of pipes\n"; + + bool read_entire_file = false; ParseOptions po(usage); + po.Register("read-entire-file", &read_entire_file, "If true, use regular WaveHolder " + "instead of WaveInfoHolder to ensure the returned duration is correct."); + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -48,21 +55,36 @@ int main(int argc, char *argv[]) { double sum_duration = 0.0, min_duration = std::numeric_limits::infinity(), max_duration = 0; - - SequentialTableReader wav_reader(wav_rspecifier); + int32 num_done = 0; + BaseFloatWriter duration_writer(duration_wspecifier); + if (read_entire_file) { + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string key = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + BaseFloat duration = wave_data.Duration(); + duration_writer.Write(key, duration); - int32 num_done = 0; - for (; !wav_reader.Done(); wav_reader.Next()) { - std::string key = wav_reader.Key(); - const WaveData &wave_data = wav_reader.Value(); - BaseFloat duration = wave_data.Duration(); - duration_writer.Write(key, duration); - - sum_duration += duration; - min_duration = std::min(min_duration, duration); - max_duration = std::max(max_duration, duration); - num_done++; + sum_duration += duration; + min_duration = std::min(min_duration, duration); + max_duration = std::max(max_duration, duration); + num_done++; + } + } + else { + SequentialTableReader wav_reader(wav_rspecifier); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string key = wav_reader.Key(); + const WaveData &wave_data = wav_reader.Value(); + BaseFloat duration = wave_data.Duration(); + duration_writer.Write(key, duration); + + sum_duration += duration; + min_duration = std::min(min_duration, duration); + max_duration = std::max(max_duration, duration); + num_done++; + } } KALDI_LOG << "Printed duration for " << num_done << " audio files."; diff --git a/src/fgmmbin/Makefile b/src/fgmmbin/Makefile index 49bfa11aade..c8d01e31b6e 100644 --- a/src/fgmmbin/Makefile +++ b/src/fgmmbin/Makefile @@ -7,7 +7,7 @@ BINFILES = fgmm-global-acc-stats fgmm-global-sum-accs fgmm-global-est \ fgmm-global-merge fgmm-global-to-gmm fgmm-gselect fgmm-global-get-frame-likes \ fgmm-global-acc-stats-twofeats fgmm-global-copy fgmm-global-mixdown \ fgmm-global-gselect-to-post fgmm-global-info \ - fgmm-global-acc-stats-post fgmm-global-init-from-accs + fgmm-global-acc-stats-post fgmm-global-init-from-accs OBJFILES = @@ -17,8 +17,8 @@ OBJFILES = TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../feat/kaldi-feat.a \ - ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a ../thread/kaldi-thread.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fgmmbin/fgmm-global-init-from-accs.cc b/src/fgmmbin/fgmm-global-init-from-accs.cc index def175c3b87..23dc6be75cf 100644 --- a/src/fgmmbin/fgmm-global-init-from-accs.cc +++ b/src/fgmmbin/fgmm-global-init-from-accs.cc @@ -60,8 +60,9 @@ int main(int argc, char *argv[]) { gmm_accs.Read(ki.Stream(), binary, true /* add accs. 
*/); } - int32 num_gauss = gmm_accs.NumGauss(), - dim = gmm_accs.Dim(); + int32 num_gauss = gmm_accs.NumGauss(), dim = gmm_accs.Dim(), + tot_floored = 0, gauss_floored = 0; + FullGmm fgmm(num_components, dim); Vector weights(num_gauss); @@ -85,14 +86,26 @@ int main(int argc, char *argv[]) { SpMatrix covar(gmm_accs.covariance_accumulator()[i]); covar.Scale(1.0 / occ); covar.AddVec2(-1.0, means.Row(i)); // subtract squared means. - covar.Invert(); + // Floor variance Eigenvalues. + BaseFloat floor = std::max( + static_cast(gmm_opts.variance_floor), + static_cast(covar.MaxAbsEig() / gmm_opts.max_condition)); + int32 floored = covar.ApplyFloor(floor); + if (floored) { + tot_floored += floored; + gauss_floored++; + } + covar.InvertDouble(); invcovars.push_back(covar); } fgmm.SetWeights(weights); fgmm.SetInvCovarsAndMeans(invcovars, means); int32 num_bad = fgmm.ComputeGconsts(); KALDI_LOG << "FullGmm has " << num_bad << " bad GConsts"; - + if (tot_floored > 0) { + KALDI_WARN << tot_floored << " variances floored in " << gauss_floored + << " Gaussians."; + } WriteKaldiObject(fgmm, model_out_filename, binary_write); KALDI_LOG << "Written model to " << model_out_filename; diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 6c381c48690..6106262859a 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -17,14 +17,14 @@ BINFILES = fstdeterminizestar \ fstdeterminizelog fstphicompose fstrhocompose fstpropfinal fstcopy \ fstpushspecial fsts-to-transcripts -OBJFILES = +OBJFILES = -TESTFILES = +TESTFILES = # actually, this library is currently empty. Everything is a header. -LIBFILE = +LIBFILE = -ADDLIBS = ../fstext/kaldi-fstext.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ - ../util/kaldi-util.a +ADDLIBS = ../fstext/kaldi-fstext.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/src/fstbin/fstrmsymbols.cc b/src/fstbin/fstrmsymbols.cc index 438170c2b98..75f5ab18654 100644 --- a/src/fstbin/fstrmsymbols.cc +++ b/src/fstbin/fstrmsymbols.cc @@ -25,19 +25,62 @@ #include "fstext/fstext-utils.h" #include "fstext/kaldi-fst-io.h" -/* some test examples: - ( echo 3; echo 4) > /tmp/in.list - ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols /tmp/in.list | fstprint +namespace fst { +// we can move these functions elsewhere later, if they are needed in other +// places. - cd ~/tmpdir - while true; do - fstrand > 1.fst - fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst > 2.fst - fstequivalent --random=true 1.fst 2.fst || echo "Test failed" - echo -n "." - done +template +void RemoveArcsWithSomeInputSymbols(const std::vector &symbols_in, + VectorFst *fst) { + typedef typename Arc::StateId StateId; + + kaldi::ConstIntegerSet symbol_set(symbols_in); + + StateId num_states = fst->NumStates(); + StateId dead_state = fst->AddState(); + for (StateId s = 0; s < num_states; s++) { + for (MutableArcIterator > iter(fst, s); + !iter.Done(); iter.Next()) { + if (symbol_set.count(iter.Value().ilabel) != 0) { + Arc arc = iter.Value(); + arc.nextstate = dead_state; + iter.SetValue(arc); + } + } + } + // Connect() will actually remove the arcs, and the dead state. 
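+  // (Why this works: Connect() keeps only states that are both accessible
+  // from the start state and co-accessible to a final state. dead_state has
+  // no final weight and no outgoing arcs, so it is not co-accessible, and
+  // Connect() deletes it together with every arc redirected into it.)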
+ Connect(fst); + if (fst->NumStates() == 0) + KALDI_WARN << "After Connect(), fst was empty."; +} + +template +void PenalizeArcsWithSomeInputSymbols(const std::vector &symbols_in, + float penalty, + VectorFst *fst) { + typedef typename Arc::StateId StateId; + typedef typename Arc::Label Label; + typedef typename Arc::Weight Weight; + + Weight penalty_weight(penalty); + + kaldi::ConstIntegerSet symbol_set(symbols_in); + + StateId num_states = fst->NumStates(); + for (StateId s = 0; s < num_states; s++) { + for (MutableArcIterator > iter(fst, s); + !iter.Done(); iter.Next()) { + if (symbol_set.count(iter.Value().ilabel) != 0) { + Arc arc = iter.Value(); + arc.weight = Times(arc.weight, penalty_weight); + iter.SetValue(arc); + } + } + } +} + +} -*/ int main(int argc, char *argv[]) { try { @@ -45,47 +88,105 @@ int main(int argc, char *argv[]) { using namespace fst; using kaldi::int32; - bool remove_from_output = false; - + bool apply_to_output = false; + bool remove_arcs = false; + float penalty = -std::numeric_limits::infinity(); + const char *usage = - "Replaces a subset of symbols with epsilon, wherever they appear on the input side\n" - "of an FST (or the output side, with --remove-from-output=true)\n" + "With no options, replaces a subset of symbols with epsilon, wherever\n" + "they appear on the input side of an FST." + "With --remove-arcs=true, will remove arcs that contain these symbols\n" + "on the input\n" + "With --penalty=, will add the specified penalty to the\n" + "cost of any arc that has one of the given symbols on its input side\n" + "In all cases, the option --apply-to-output=true (or for\n" + "back-compatibility, --remove-from-output=true) makes this apply\n" + "to the output side.\n" "\n" - "Usage: fstrmsymbols in-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstrmsymbols in.list < in.fst > out.fst\n"; + "Usage: fstrmsymbols [options] [ []]\n" + "E.g: fstrmsymbols in.list < in.fst > out.fst\n" + " is an rxfilename specifying a file containing list of integers\n" + "representing symbols, in text form, one per line.\n"; ParseOptions po(usage); - po.Register("remove-from-output", &remove_from_output, "If true, remove these symbols from " - "the output, not the input, side."); + po.Register("remove-from-output", &apply_to_output, "If true, this applies to symbols " + "on the output, not the input, side. (For back compatibility; use " + "--apply-to-output insead)"); + po.Register("apply-to-output", &apply_to_output, "If true, this applies to symbols " + "on the output, not the input, side."); + po.Register("remove-arcs", &remove_arcs, "If true, instead of converting the symbol " + "to , remove the arcs."); + po.Register("penalty", &penalty, "If specified, instead of converting " + "the symbol to , penalize the arc it is on by adding this " + "value to its cost."); + + po.Read(argc, argv); + if (remove_arcs && + penalty != -std::numeric_limits::infinity()) + KALDI_ERR << "--remove-arc and --penalty options are mutually exclusive"; + if (po.NumArgs() < 1 || po.NumArgs() > 3) { po.PrintUsage(); exit(1); } - + std::string disambig_rxfilename = po.GetArg(1), fst_rxfilename = po.GetOptArg(2), fst_wxfilename = po.GetOptArg(3); VectorFst *fst = ReadFstKaldi(fst_rxfilename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_rxfilename, &disambig_in)) KALDI_ERR << "fstrmsymbols: Could not read disambiguation symbols from " << (disambig_rxfilename == "" ? 
"standard input" : disambig_rxfilename); - if (remove_from_output) Invert(fst); - RemoveSomeInputSymbols(disambig_in, fst); - if (remove_from_output) Invert(fst); - + if (apply_to_output) Invert(fst); + if (remove_arcs) { + RemoveArcsWithSomeInputSymbols(disambig_in, fst); + } else if (penalty != -std::numeric_limits::infinity()) { + PenalizeArcsWithSomeInputSymbols(disambig_in, penalty, fst); + } else { + RemoveSomeInputSymbols(disambig_in, fst); + } + if (apply_to_output) Invert(fst); + WriteFstKaldi(*fst, fst_wxfilename); delete fst; - return 0; + return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; } } +/* some test examples: + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols "echo 3; echo 4|" | fstprint + # should produce: + # 0 0 1 1 + # 0 0 0 2 + # 0 + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --apply-to-output=true "echo 2; echo 3|" | fstprint + # should produce: + # 0 0 1 1 + # 0 0 3 0 + # 0 + + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --remove-arcs=true "echo 3; echo 4|" | fstprint + # should produce: + # 0 0 1 1 + # 0 + + ( echo "0 0 1 1"; echo " 0 0 3 2"; echo "0 0"; ) | fstcompile | fstrmsymbols --penalty=2 "echo 3; echo 4; echo 5|" | fstprint +# should produce: + # 0 0 1 1 + # 0 0 3 2 2 + # 0 + +*/ diff --git a/src/fstbin/fsts-to-transcripts.cc b/src/fstbin/fsts-to-transcripts.cc index 3190a8e2a86..7c301e10390 100644 --- a/src/fstbin/fsts-to-transcripts.cc +++ b/src/fstbin/fsts-to-transcripts.cc @@ -33,19 +33,19 @@ int main(int argc, char *argv[]) { const char *usage = "Reads a table of FSTs; for each element, finds the best path and prints out the\n" - "output-symbol sequence (if --output-side=true), or input-symbol sequence" + "output-symbol sequence (if --output-side=true), or input-symbol sequence " "otherwise.\n" "\n" - "Usage: fsts-to-transcripts [options] fsts-rspecifier transcriptions-wspecifier\n" + "Usage: fsts-to-transcripts [options] \n" " e.g.: fsts-to-transcripts ark:train.fsts ark,t:train.text\n"; - + ParseOptions po(usage); bool output_side = true; - po.Register("output-side", &output_side, "If true, extract the symbols on the output\n" - "side of the FSTs, else the input side."); - + po.Register("output-side", &output_side, "If true, extract the symbols on " + "the output side of the FSTs, else the input side."); + po.Read(argc, argv); if (po.NumArgs() < 2 || po.NumArgs() > 3) { @@ -64,7 +64,7 @@ int main(int argc, char *argv[]) { for (; !fst_reader.Done(); fst_reader.Next()) { std::string key = fst_reader.Key(); const VectorFst &fst = fst_reader.Value(); - + VectorFst shortest_path; ShortestPath(fst, &shortest_path); // the OpenFst algorithm ShortestPath. @@ -75,7 +75,7 @@ int main(int argc, char *argv[]) { n_err++; continue; } - + std::vector transcript; bool ans; if (output_side) ans = fst::GetLinearSymbolSequence( @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { } KALDI_LOG << "Converted " << n_done << " FSTs, " << n_err << " with errors"; - return (n_done != 0 ? 0 : 1); + return (n_done != 0 ? 0 : 1); } catch(const std::exception &e) { std::cerr << e.what(); return -1; diff --git a/src/fstext/Makefile b/src/fstext/Makefile index 2b8c54c653f..3c419182684 100644 --- a/src/fstext/Makefile +++ b/src/fstext/Makefile @@ -25,6 +25,6 @@ LIBNAME = kaldi-fstext # tree and matrix archives needed for test-context-fst # matrix archive needed for push-special. 
ADDLIBS = ../tree/kaldi-tree.a ../matrix/kaldi-matrix.a \ - ../util/kaldi-util.a ../base/kaldi-base.a + ../util/kaldi-util.a ../thread/kaldi-thread.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstext/context-fst-inl.h b/src/fstext/context-fst-inl.h index 889f138e0fa..5127e7ae584 100644 --- a/src/fstext/context-fst-inl.h +++ b/src/fstext/context-fst-inl.h @@ -88,8 +88,8 @@ template ContextFstImpl::ContextFstImpl(const ContextFstImpl &other): phone_syms_(other.phone_syms_), disambig_syms_(other.disambig_syms_) { - std::cerr << "ContextFst copying not yet supported [not hard, but would have to test.]"; - exit(1); + KALDI_ERR << "ContextFst copying not yet supported " + << "[not hard, but would have to test.]"; } @@ -149,7 +149,7 @@ typename ContextFstImpl::Weight ContextFstImpl::Final( assert(static_cast(s) < state_seqs_.size()); // make sure state exists already. if (!this->HasFinal(s)) { // Work out final-state weight. const vector &seq = state_seqs_[s]; - + bool final_ok; assert(static_cast(seq.size()) == N_-1); @@ -198,8 +198,8 @@ size_t ContextFstImpl::NumArcs(StateId s) { } else { // For normal states, in general there is potentially an arc for each phone and an arc // for each disambiguation symbol, plus one for the subsequential symbol. - return phone_syms_.size() + disambig_syms_.size() + 1; - } + return phone_syms_.size() + disambig_syms_.size() + 1; + } } template @@ -310,9 +310,9 @@ bool ContextFstImpl::CreateArc(StateId s, // the output arcs, just 0. return CreatePhoneOrEpsArc(s, nextstate, olabel, phoneseq, oarc); } else { - std::cerr << "ContextFst: CreateArc, invalid olabel supplied [confusion about phone list or disambig symbols?]: "<<(olabel); - exit(1); - } + KALDI_ERR << "ContextFst: CreateArc, invalid olabel supplied [confusion " + << "about phone list or disambig symbols?]: " << olabel; + } return false; // won't get here. suppress compiler error. } @@ -400,7 +400,6 @@ bool ContextMatcher::Find(typename Arc::Label match_label) { template void AddSubsequentialLoop(typename Arc::Label subseq_symbol, MutableFst *fst) { - typedef typename Arc::Label Label; typedef typename Arc::StateId StateId; typedef typename Arc::Weight Weight; @@ -463,30 +462,26 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector > &info, assert(s == 0); for (size_t i = 1; i < info.size(); i++) { if (info[i].size() == 0) { - std::cerr << "CreateILabelInfoSymbolTable: invalid ilabel-info"; - exit(1); + KALDI_ERR << "Invalid ilabel-info"; } if (info[i].size() == 1 && info[i][0] <= 0) { if (info[i][0] == 0) { // special symbol at start that we want to call #-1. 
s = ans->AddSymbol(initial_disambig); if (s != i) { - std::cerr << "Disambig symbol " << initial_disambig - << " already in vocab\n"; - exit(1); + KALDI_ERR << "Disambig symbol " << initial_disambig + << " already in vocab"; } } else { std::string disambig_sym = phones_symtab.Find(-info[i][0]); if (disambig_sym == "") { - std::cerr << "CreateILabelInfoSymbolTable: disambig symbol " - << -info[i][0] << " not in phone symbol-table."; - exit(1); + KALDI_ERR << "Disambig symbol " << -info[i][0] + << " not in phone symbol-table"; } s = ans->AddSymbol(disambig_sym); if (s != i) { - std::cerr << "Disambig symbol " << disambig_sym - << " already in vocab\n"; - exit(1); + KALDI_ERR << "Disambig symbol " << disambig_sym + << " already in vocab"; } } } else { @@ -495,24 +490,22 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector > &info, for (size_t j = 0; j < info[i].size(); j++) { std::string phonesym = phones_symtab.Find(info[i][j]); if (phonesym == "") { - std::cerr << "CreateILabelInfoSymbolTable: symbol " - << info[i][j] << " not in phone symbol-table."; - exit(1); + KALDI_ERR << "Symbol " << info[i][j] + << " not in phone symbol-table"; } if (j != 0) newsym += separator; newsym += phonesym; } int64 s = ans->AddSymbol(newsym); if (s != static_cast(i)) { - std::cerr << "CreateILabelInfoSymbolTable: some problem with duplicate symbols."; - exit(1); + KALDI_ERR << "Some problem with duplicate symbols"; } } } return ans; } -inline void ComposeContext(vector &disambig_syms_in, +inline void ComposeContext(const vector &disambig_syms_in, int N, int P, VectorFst *ifst, VectorFst *ofst, @@ -532,7 +525,7 @@ inline void ComposeContext(vector &disambig_syms_in, if (!std::binary_search(disambig_syms.begin(), disambig_syms.end(), all_syms[i])) phones.push_back(all_syms[i]); - + // Get subsequential symbol that does not clash with // any disambiguation symbol or symbol in the FST. int32 subseq_sym = 1; @@ -540,7 +533,7 @@ inline void ComposeContext(vector &disambig_syms_in, subseq_sym = std::max(subseq_sym, all_syms.back() + 1); if (!disambig_syms.empty()) subseq_sym = std::max(subseq_sym, disambig_syms.back() + 1); - + // if P == N-1, it's left-context, and no subsequential symbol needed. if (P != N-1) AddSubsequentialLoop(subseq_sym, ifst); @@ -551,8 +544,8 @@ inline void ComposeContext(vector &disambig_syms_in, /// -} // end namespace fst +} // namespace fst -#endif +#endif // KALDI_FSTEXT_CONTEXT_FST_INL_H_ diff --git a/src/fstext/context-fst.h b/src/fstext/context-fst.h index c0b62f00135..15cb0ef9fdb 100644 --- a/src/fstext/context-fst.h +++ b/src/fstext/context-fst.h @@ -274,9 +274,9 @@ class ContextFst : public Fst { virtual uint64 Properties(uint64 mask, bool test) const { if (test) { - uint64 known, test = TestProperties(*this, mask, &known); - impl_->SetProperties(test, known); - return test & mask; + uint64 knownprops, testprops = TestProperties(*this, mask, &knownprops); + impl_->SetProperties(knownprops, testprops); + return testprops & mask; } else { return impl_->Properties(mask); } @@ -310,7 +310,7 @@ class ContextFst : public Fst { ContextFstImpl *impl_; // protected so CacheStateIterator // Makes visible to friends. ContextFstImpl *GetImpl() const { return impl_; } - // would be: ImplToFst >::GetImpl(); + // would be: ImplToFst >::GetImpl(); // but need to convert to using the ImplToFst stuff. void operator = (const ContextFstImpl &fst); // disallow @@ -504,7 +504,7 @@ void ComposeContextFst(const ContextFst &ifst1, const Fst &ifs information to ilabels_out. 
"ifst" is mutable because we need to add the subsequential loop. */ -inline void ComposeContext(vector &disambig_syms, +inline void ComposeContext(const vector &disambig_syms, int N, int P, VectorFst *ifst, VectorFst *ofst, @@ -534,4 +534,4 @@ void AddSubsequentialLoop(typename Arc::Label subseq_symbol, #include "context-fst-inl.h" -#endif +#endif // KALDI_FSTEXT_CONTEXT_FST_H_ diff --git a/src/fstext/deterministic-fst-inl.h b/src/fstext/deterministic-fst-inl.h index 1af52ce594c..d9099e47ba3 100644 --- a/src/fstext/deterministic-fst-inl.h +++ b/src/fstext/deterministic-fst-inl.h @@ -1,7 +1,8 @@ // fstext/deterministic-fst-inl.h -// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey) +// Copyright 2011-2012 Gilles Boulianne // 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -311,6 +312,197 @@ bool LmExampleDeterministicOnDemandFst::GetArc( return true; } + +template +void ComposeDeterministicOnDemand(const Fst &fst1, + DeterministicOnDemandFst *fst2, + MutableFst *fst_composed) { + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + typedef std::pair StatePair; + typedef unordered_map > MapType; + typedef typename MapType::iterator IterType; + + fst_composed->DeleteStates(); + + MapType state_map; + std::queue state_queue; + + // Set start state in fst_composed. + StateId s1 = fst1.Start(), + s2 = fst2->Start(), + start_state = fst_composed->AddState(); + StatePair start_pair(s1, s2); + state_queue.push(start_pair); + fst_composed->SetStart(start_state); + // A mapping between pairs of states in fst1 and fst2 and the corresponding + // state in fst_composed. + std::pair start_map(start_pair, start_state); + std::pair result = state_map.insert(start_map); + KALDI_ASSERT(result.second == true); + + while (!state_queue.empty()) { + StatePair q = state_queue.front(); + StateId q1 = q.first, + q2 = q.second; + state_queue.pop(); + // If the product of the final weights of the two fsts is non-zero then + // we can set a final-prob in fst_composed + Weight final_weight = Times(fst1.Final(q1), fst2->Final(q2)); + if (final_weight != Weight::Zero()) { + KALDI_ASSERT(state_map.find(q) != state_map.end()); + fst_composed->SetFinal(state_map[q], final_weight); + } + + // for each pair of edges from fst1 and fst2 at q1 and q2. + for (ArcIterator > aiter(fst1, q1); !aiter.Done(); aiter.Next()) { + const Arc &arc1 = aiter.Value(); + Arc arc2; + StatePair next_pair; + StateId next_state1 = arc1.nextstate, + next_state2, + next_state; + // If there is an epsilon on the arc of fst1 we transition to the next + // state but keep fst2 at the current state. + if (arc1.olabel == 0) { + next_state2 = q2; + } else { + bool match = fst2->GetArc(q2, arc1.olabel, &arc2); + if (!match) // There is no matching arc -> nothing to do. + continue; + next_state2 = arc2.nextstate; + } + next_pair = StatePair(next_state1, next_state2); + IterType sitr = state_map.find(next_pair); + // If sitr == state_map.end() then the state isn't in fst_composed yet. + if (sitr == state_map.end()) { + next_state = fst_composed->AddState(); + std::pair new_state( + next_pair, next_state); + std::pair result = state_map.insert(new_state); + // Since we already checked if state_map contained new_state, + // it should always be added if we reach here. 
+ KALDI_ASSERT(result.second == true); + state_queue.push(next_pair); + // If sitr != state_map.end() then the next state is already in + // the state_map. + } else { + next_state = sitr->second; + } + if (arc1.olabel == 0) { + fst_composed->AddArc(state_map[q], Arc(arc1.ilabel, 0, arc1.weight, + next_state)); + } else { + fst_composed->AddArc(state_map[q], Arc(arc1.ilabel, arc2.olabel, + Times(arc1.weight, arc2.weight), next_state)); + } + } + } +} + + +// we are doing *fst_composed = Compose(Inverse(*left), right). +template +void ComposeDeterministicOnDemandInverse(const Fst &right, + DeterministicOnDemandFst *left, + MutableFst *fst_composed) { + typedef typename Arc::Weight Weight; + typedef typename Arc::StateId StateId; + typedef std::pair StatePair; + typedef unordered_map > MapType; + typedef typename MapType::iterator IterType; + + fst_composed->DeleteStates(); + + // the queue and map contain pairs (state-in-left, state-in-right) + MapType state_map; + std::queue state_queue; + + // Set start state in fst_composed. + StateId s_left = left->Start(), + s_right = right.Start(), + start_state = fst_composed->AddState(); + StatePair start_pair(s_left, s_right); + state_queue.push(start_pair); + fst_composed->SetStart(start_state); + // A mapping between pairs of states in *left and right, and the corresponding + // state in fst_composed. + std::pair start_map(start_pair, start_state); + std::pair result = state_map.insert(start_map); + KALDI_ASSERT(result.second == true); + + while (!state_queue.empty()) { + StatePair q = state_queue.front(); + StateId q_left = q.first, + q_right = q.second; + state_queue.pop(); + // If the product of the final weights of the two fsts is non-zero then + // we can set a final-prob in fst_composed + Weight final_weight = Times(left->Final(q_left), right.Final(q_right)); + if (final_weight != Weight::Zero()) { + KALDI_ASSERT(state_map.find(q) != state_map.end()); + fst_composed->SetFinal(state_map[q], final_weight); + } + + for (ArcIterator > aiter(right, q_right); !aiter.Done(); aiter.Next()) { + const Arc &arc_right = aiter.Value(); + Arc arc_left; + StatePair next_pair; + StateId next_state_right = arc_right.nextstate, + next_state_left, + next_state; + // If there is an epsilon on the input side of the rigth arc, we + // transition to the next state of the output but keep 'left' at the + // current state. + if (arc_right.ilabel == 0) { + next_state_left = q_left; + } else { + bool match = left->GetArc(q_left, arc_right.ilabel, &arc_left); + if (!match) // There is no matching arc -> nothing to do. + continue; + // the next 'swap' is because we are composing with the inverse of + // *left. Just removing the swap statement wouldn't let us compose + // with non-inverted *left though, because the GetArc function call + // above interprets the second argument as an ilabel not an olabel. + std::swap(arc_left.ilabel, arc_left.olabel); + next_state_left = arc_left.nextstate; + } + next_pair = StatePair(next_state_left, next_state_right); + IterType sitr = state_map.find(next_pair); + // If sitr == state_map.end() then the state isn't in fst_composed yet. + if (sitr == state_map.end()) { + next_state = fst_composed->AddState(); + std::pair new_state( + next_pair, next_state); + std::pair result = state_map.insert(new_state); + // Since we already checked if state_map contained new_state, + // it should always be added if we reach here. 
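Both ComposeDeterministicOnDemand and the inverse variant rely on the same bookkeeping: a map from (left-state, right-state) pairs to output states, plus a queue of pairs still awaiting expansion. The fragment below is a stripped-down, stand-alone sketch of just that pattern, using plain std containers and a made-up transition rule; it is not the Kaldi code itself.

#include <cstdio>
#include <map>
#include <queue>
#include <utility>

int main() {
  typedef std::pair<int, int> StatePair;
  std::map<StatePair, int> state_map;   // pair -> state id in the composed FST
  std::queue<StatePair> q;
  int next_id = 0;

  StatePair start(0, 0);
  state_map[start] = next_id++;
  q.push(start);

  while (!q.empty()) {
    StatePair cur = q.front();
    q.pop();
    if (cur.first >= 3) continue;             // toy stopping condition
    StatePair next(cur.first + 1, cur.second + 1);  // toy "matched arc"
    if (state_map.find(next) == state_map.end()) {  // first visit: allocate, queue
      state_map[next] = next_id++;
      q.push(next);
    }
    std::printf("arc: %d -> %d\n", state_map[cur], state_map[next]);
  }
  return 0;
}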
+ KALDI_ASSERT(result.second == true); + state_queue.push(next_pair); + // If sitr != state_map.end() then the next state is already in + // the state_map. + } else { + next_state = sitr->second; + } + if (arc_right.ilabel == 0) { + // we didn't get an actual arc from the left FST. + fst_composed->AddArc(state_map[q], Arc(0, arc_right.olabel, + arc_right.weight, + next_state)); + } else { + fst_composed->AddArc(state_map[q], + Arc(arc_left.ilabel, arc_right.olabel, + Times(arc_left.weight, arc_right.weight), + next_state)); + } + } + } +} + + + } // end namespace fst diff --git a/src/fstext/deterministic-fst-test.cc b/src/fstext/deterministic-fst-test.cc index 90b74e27e9c..a041291e427 100644 --- a/src/fstext/deterministic-fst-test.cc +++ b/src/fstext/deterministic-fst-test.cc @@ -109,7 +109,7 @@ StdVectorFst* CreateResultFst() { } void DeleteTestFst(StdVectorFst *fst) { - if (fst) delete fst; + delete fst; } // Follow paths from an input fst representing a string diff --git a/src/fstext/deterministic-fst.h b/src/fstext/deterministic-fst.h index ecb3f9e969b..65ec4685170 100644 --- a/src/fstext/deterministic-fst.h +++ b/src/fstext/deterministic-fst.h @@ -1,7 +1,8 @@ // fstext/deterministic-fst.h -// Copyright 2011-2012 Gilles Boulianne Johns Hopkins University (author: Daniel Povey) +// Copyright 2011-2012 Gilles Boulianne // 2014 Telepoint Global Hosting Service, LLC. (Author: David Snyder) +// 2012-2015 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -63,15 +64,12 @@ namespace fst { /// @{ -/// class DeterministicOnDemandFst is an "FST-like" base-class. -/// It does not actually inherit from any Fst class because its -/// interface is not exactly the same (it doesn't have the -/// GetArc function). -/// It assumes that the FST can have only one arc for any -/// given input symbol, which makes the GetArc function below -/// possible. -/// Note: we don't use "const" in this interface, because -/// it creates problems when we do things like caching, +/// class DeterministicOnDemandFst is an "FST-like" base-class. It does not +/// actually inherit from any Fst class because its interface is not exactly the +/// same (it doesn't have the GetArc function). It assumes that the FST can +/// have only one arc for any given input symbol, which makes the GetArc +/// function below possible. Note: we don't use "const" in this interface, +/// because it creates problems when we do things like caching. template class DeterministicOnDemandFst { public: @@ -253,6 +251,37 @@ class LmExampleDeterministicOnDemandFst: public DeterministicOnDemandFst { }; +// Compose an FST (which may be a lattice) with a DeterministicOnDemandFst and +// store the result in fst_composed. This is mainly used for expanding lattice +// n-gram histories, where fst1 is a lattice and fst2 is an UnweightedNgramFst. +// This does not call Connect. +template +void ComposeDeterministicOnDemand(const Fst &fst1, + DeterministicOnDemandFst *fst2, + MutableFst *fst_composed); + +/** + This function does + '*fst_composed = Compose(Inverse(*fst2), fst1)' + Note that the arguments are reversed; this is unfortunate but it's + because the fst2 argument needs to be non-const and non-const arguments + must follow const ones. + This is the counterpart to ComposeDeterministicOnDemand, used for + the case where the DeterministicOnDemandFst is on the left. The + reason why we need to make the left-hand argument to compose the + inverse of 'fst2' (i.e. 
with the input and output symbols swapped),
+   is that the DeterministicOnDemandFst interface only supports lookup
+   by ilabel (see its function GetArc).
+   This does not call Connect.
+*/
+template<class Arc>
+void ComposeDeterministicOnDemandInverse(const Fst<Arc> &fst1,
+                                         DeterministicOnDemandFst<Arc> *fst2,
+                                         MutableFst<Arc> *fst_composed);
+
+
+
+
 /// @}
 } // namespace fst
diff --git a/src/fstext/determinize-lattice-inl.h b/src/fstext/determinize-lattice-inl.h
index b41deb980ee..9aff3e774a4 100644
--- a/src/fstext/determinize-lattice-inl.h
+++ b/src/fstext/determinize-lattice-inl.h
@@ -48,15 +48,15 @@ template<class IntType> class LatticeStringRepository {
   // Note: all Entry* pointers returned in function calls are
   // owned by the repository itself, not by the caller!
 
-  // Interface guarantees empty string is NULL.
-  inline const Entry *EmptyString() { return NULL; }
+  // Interface guarantees empty string is NULL.
+  inline const Entry *EmptyString() { return NULL; }
 
   // Returns string of "parent" with i appended.  Pointer
   // owned by repository
   const Entry *Successor(const Entry *parent, IntType i) {
     new_entry_->parent = parent;
     new_entry_->i = i;
-
+
     std::pair<typename SetType::iterator, bool> pr = set_.insert(new_entry_);
     if (pr.second) { // Was successfully inserted (was not there).  We need to
       // replace the element we inserted, which resides on the
@@ -124,7 +124,7 @@ template<class IntType> class LatticeStringRepository {
       ans = Successor(ans, a_vec[i]);
     return ans;
   }
-
+
   // Returns true if a is a prefix of b.  If a is prefix of b,
@@ -145,7 +145,7 @@ template<class IntType> class LatticeStringRepository {
     }
     return ans;
   }
-
+
   void ConvertToVector(const Entry *entry, vector<IntType> *out) const {
     size_t length = Size(entry);
     out->resize(length);
@@ -165,9 +165,9 @@ template<class IntType> class LatticeStringRepository {
       e = Successor(e, vec[i]);
     return e;
   }
-
+
   LatticeStringRepository() { new_entry_ = new Entry; }
-
+
   void Destroy() {
     for (typename SetType::iterator iter = set_.begin();
          iter != set_.end();
@@ -199,13 +199,13 @@ template<class IntType> class LatticeStringRepository {
     }
     set_.swap(tmp_set);
   }
-
+
   ~LatticeStringRepository() { Destroy(); }
   int32 MemSize() const {
     return set_.size() * sizeof(Entry) * 2; // this is a lower bound
     // on the size this structure might take.
   }
- private:
+ private:
   class EntryKey { // Hash function object.
   public:
    inline size_t operator()(const Entry *entry) const {
@@ -234,7 +234,7 @@ template<class IntType> class LatticeStringRepository {
       }
     }
   }
-
+
   DISALLOW_COPY_AND_ASSIGN(LatticeStringRepository);
   Entry *new_entry_; // We always have a pre-allocated Entry ready to use,
                      // to avoid unnecessary news and deletes.
@@ -263,8 +263,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   typedef CompactLatticeWeightTpl<Weight, IntType> CompactWeight;
   typedef ArcTpl<CompactWeight> CompactArc; // arc in compact, acceptor form of lattice
-  typedef ArcTpl<Weight> Arc; // arc in non-compact version of lattice
-
+  typedef ArcTpl<Weight> Arc; // arc in non-compact version of lattice
+
   // Output to standard FST with CompactWeightTpl<Weight> as its weight type (the
   // weight stores the original output-symbol strings).  If destroy == true,
@@ -427,11 +427,11 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     { vector<OutputStateId> tmp; tmp.swap(queue_); }
     { vector<pair<Label, Element> > tmp; tmp.swap(all_elems_tmp_); }
   }
-
+
   ~LatticeDeterminizer() {
     FreeMostMemory(); // rest is deleted by destructors.
   }
-  void RebuildRepository() { // rebuild the string repository,
+  void RebuildRepository() { // rebuild the string repository,
     // freeing stuff we don't need.. we call this when memory usage
     // passes a supplied threshold.  We need to accumulate all the
     // strings we need the repository to "remember", then tell it
@@ -464,7 +464,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                          needed_strings.end()); // uniq the strings.
     repository_.Rebuild(needed_strings);
   }
-
+
   bool CheckMemoryUsage() {
     int32 repo_size = repository_.MemSize(),
         arcs_size = num_arcs_ * sizeof(TempArc),
@@ -479,7 +479,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       KALDI_VLOG(2) << "Rebuilt repository in determinize-lattice: repository shrank from "
                     << repo_size << " to " << new_repo_size << " bytes (approximately)";
-
+
       if (new_total_size > static_cast<int32>(opts_.max_mem * 0.8)) {
         // Rebuilding didn't help enough-- we need a margin to stop
         // having to rebuild too often.
@@ -492,7 +492,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     return true;
   }
-
+
   // Returns true on success.  Can fail for out-of-memory
   // or max-states related reasons.
   bool Determinize(bool *debug_ptr) {
@@ -521,12 +521,12 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                  << repo_size << "," << arcs_size << "," << elems_size << ")";
       return (determinized_ = false);
     } catch (std::runtime_error) {
-      std::cerr << "Caught exception doing lattice determinization\n";
+      KALDI_WARN << "Caught exception doing lattice determinization";
       return (determinized_ = false);
-    }
+    }
   }
  private:
-
+
   typedef typename Arc::Label Label;
   typedef typename Arc::StateId StateId;  // use this when we don't know if it's input or output.
   typedef typename Arc::StateId InputStateId;  // state in the input FST.
@@ -547,6 +547,10 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       return (state != other.state || string != other.string ||
               weight != other.weight);
     }
+    // This operator is only intended to support sorting in EpsilonClosure()
+    bool operator < (const Element &other) const {
+      return state < other.state;
+    }
   };
 
   // Arcs in the format we temporarily create in this class (a representation, essentially of
@@ -635,7 +639,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // these types are the same anyway].
   typedef unordered_map<const vector<Element>*, Element, SubsetKey, SubsetEqual> InitialSubsetHash;
-
+
   // converts the representation of the subset from canonical (all states) to
   // minimal (only states with output symbols on arcs leaving them, and final
@@ -653,7 +657,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     subset->resize(cur_out - subset->begin());
   }
-
+
   // Takes a minimal, normalized subset, and converts it to an OutputStateId.
   // Involves a hash lookup, and possibly adding a new OutputStateId.
   // If it creates a new OutputStateId, it adds it to the queue.
@@ -672,7 +676,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     return ans;
   }
-
+
   // Given a normalized initial subset of elements (i.e. before epsilon closure),
   // compute the corresponding output-state.
   OutputStateId InitialToStateId(const vector<Element> &subset_in,
@@ -685,7 +689,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       *remaining_weight = elem.weight;
       *common_prefix = elem.string;
       if (elem.weight == Weight::Zero())
-        std::cerr << "Zero weight!\n"; // TEMP
+        KALDI_WARN << "Zero weight!"; // TEMP
       return elem.state;
     }
     // else no matching subset-- have to work it out.
@@ -698,17 +702,17 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     ConvertToMinimal(&subset); // remove all but emitting and final states.
 
     Element elem; // will be used to store remaining weight and string, and
-                  // OutputStateId, in initial_hash_;
+                  // OutputStateId, in initial_hash_;
 
     NormalizeSubset(&subset, &elem.weight, &elem.string); // normalize subset; put
     // common string and weight in "elem".  The subset is now a minimal,
     // normalized subset.
-
+
     OutputStateId ans = MinimalToStateId(subset);
     *remaining_weight = elem.weight;
     *common_prefix = elem.string;
     if (elem.weight == Weight::Zero())
-      std::cerr << "Zero weight!\n"; // TEMP
-
+      KALDI_WARN << "Zero weight!"; // TEMP
+
     // Before returning "ans", add the initial subset to the hash,
     // so that we can bypass the epsilon-closure etc., next time
     // we process the same initial subset.
@@ -748,8 +752,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     assert(0); // because we checked if a_str == b_str above, shouldn't reach here
     return 0;
   }
-
-
+
+
   // This function computes epsilon closure of subset of states by following epsilon links.
   // Called by InitialToStateId and Initialize.
   // Has no side effects except on the string repository.  The "output_subset" is not
@@ -759,37 +763,26 @@
     // at input, subset must have only one example of each StateId.  [will still
     // be so at output].  This function follows input-epsilons, and augments the
     // subset accordingly.
-
+
+    std::deque<Element> queue;
     unordered_map<InputStateId, Element> cur_subset;
-    typedef typename unordered_map<InputStateId, Element>::iterator MapIter;
+    typedef typename unordered_map<InputStateId, Element>::iterator MapIter;
+    typedef typename vector<Element>::const_iterator VecIter;
-    {
-      MapIter iter = cur_subset.end();
-      for (size_t i = 0;i < subset->size();i++) {
-        std::pair<InputStateId, Element> pr((*subset)[i].state, (*subset)[i]);
-#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
-        iter = cur_subset.insert(iter, pr).first;
-#else
-        iter = cur_subset.insert(iter, pr);
-#endif
-        // By providing iterator where we inserted last one, we make insertion more efficient since
-        // input subset was already in sorted order.
-      }
+    for (VecIter iter = subset->begin(); iter != subset->end(); ++iter) {
+      queue.push_back(*iter);
+      cur_subset[iter->state] = *iter;
     }
-    // find whether input fst is known to be sorted on input label.
-    bool sorted = ((ifst_->Properties(kILabelSorted, false) & kILabelSorted) != 0);
-    std::deque<Element> queue;
-    for (typename vector<Element>::const_iterator iter = subset->begin();
-         iter != subset->end();
-         ++iter) queue.push_back(*iter);
+
+    // find whether input fst is known to be sorted on input label.
+    bool sorted = ((ifst_->Properties(kILabelSorted, false) & kILabelSorted) != 0);
     bool replaced_elems = false; // relates to an optimization, see below.
     int counter = 0; // stops infinite loops here for non-lattice-determinizable input;
     // useful in testing.
     while (queue.size() != 0) {
       Element elem = queue.front();
       queue.pop_front();
-
+
       // The next if-statement is a kind of optimization.  It's to prevent us
       // unnecessarily repeating the processing of a state.  "cur_subset" always
       // contains only one Element with a particular state.  The issue is that
@@ -801,8 +794,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
         continue;
       if (opts_.max_loop > 0 && counter++ > opts_.max_loop) {
         KALDI_ERR << "Lattice determinization aborted since looped more than "
-                  << opts_.max_loop << " times during epsilon closure.\n";
-        throw std::runtime_error("looped more than max-arcs times in lattice determinization");
+                  << opts_.max_loop << " times during epsilon closure";
       }
       for (ArcIterator<Fst<Arc> > aiter(*ifst_, elem.state); !aiter.Done(); aiter.Next()) {
         const Arc &arc = aiter.Value();
@@ -818,9 +810,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
           next_elem.string = elem.string;
         else
           next_elem.string = repository_.Successor(elem.string, arc.olabel);
-
-        typename unordered_map<InputStateId, Element>::iterator
-            iter = cur_subset.find(next_elem.state);
+
+        MapIter iter = cur_subset.find(next_elem.state);
         if (iter == cur_subset.end()) {
           // was no such StateId: insert and add to queue.
           cur_subset[next_elem.state] = next_elem;
@@ -843,12 +834,13 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       }
     }
 
-    { // copy cur_subset to subset.
-      // sorted order is automatic.
+    { // copy cur_subset to subset.
       subset->clear();
       subset->reserve(cur_subset.size());
       MapIter iter = cur_subset.begin(), end = cur_subset.end();
       for (; iter != end; ++iter) subset->push_back(iter->second);
+      // sort by state ID, because the subset hash function is order-dependent(see SubsetKey)
+      std::sort(subset->begin(), subset->end());
     }
   }
@@ -889,7 +881,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       temp_arc.string = final_string;
       temp_arc.weight = final_weight;
       output_arcs_[output_state].push_back(temp_arc);
-      num_arcs_++;
+      num_arcs_++;
     }
   }
@@ -900,7 +892,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
                        Weight *tot_weight, StringId *common_str) {
     if(elems->empty()) { // just set common_str, tot_weight
-      std::cerr << "[empty subset]\n"; // TEMP
+      KALDI_WARN << "[empty subset]"; // TEMP
       // to defaults and return...
       *common_str = repository_.EmptyString();
       *tot_weight = Weight::Zero();
@@ -910,14 +902,14 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     vector<IntType> common_prefix;
     repository_.ConvertToVector((*elems)[0].string, &common_prefix);
     Weight weight = (*elems)[0].weight;
-    for(size_t i = 1; i < size; i++) {
+    for (size_t i = 1; i < size; i++) {
       weight = Plus(weight, (*elems)[i].weight);
       repository_.ReduceToCommonPrefix((*elems)[i].string, &common_prefix);
     }
     assert(weight != Weight::Zero()); // we made sure to ignore arcs with zero
     // weights on them, so we shouldn't have zero here.
     size_t prefix_len = common_prefix.size();
-    for(size_t i = 0; i < size; i++) {
+    for (size_t i = 0; i < size; i++) {
       (*elems)[i].weight = Divide((*elems)[i].weight, weight, DIVIDE_LEFT);
       (*elems)[i].string =
           repository_.RemovePrefix((*elems)[i].string, prefix_len);
@@ -931,11 +923,11 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // (weight, string) pair in the semiring).
   void MakeSubsetUnique(vector<Element> *subset) {
     typedef typename vector<Element>::iterator IterType;
-
+
     // This assert is designed to fail (usually) if the subset is not sorted on
     // state.
     assert(subset->size() < 2 || (*subset)[0].state <= (*subset)[1].state);
-
+
     IterType cur_in = subset->begin(), cur_out = cur_in, end = subset->end();
     size_t num_out = 0;
     // Merge elements with same state-id
@@ -958,7 +950,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     }
     subset->resize(num_out);
   }
-
+
   // ProcessTransition is called from "ProcessTransitions".  Broken out for
   // clarity.  Processes a transition from state "state".  The set of Elements
   // represents a set of next-states with associated weights and strings, each
@@ -969,7 +961,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // semiring).
   void ProcessTransition(OutputStateId state, Label ilabel, vector<Element> *subset) {
     MakeSubsetUnique(subset); // remove duplicates with the same state.
-
+
     StringId common_str;
     Weight tot_weight;
     NormalizeSubset(subset, &tot_weight, &common_str);
@@ -978,13 +970,13 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     {
       Weight next_tot_weight;
       StringId next_common_str;
-      nextstate = InitialToStateId(*subset,
+      nextstate = InitialToStateId(*subset,
                                    &next_tot_weight,
                                    &next_common_str);
       common_str = repository_.Concatenate(common_str, next_common_str);
       tot_weight = Times(tot_weight, next_tot_weight);
     }
-
+
     // Now add an arc to the next state (would have been created if necessary by
     // InitialToStateId).
     TempArc temp_arc;
@@ -998,7 +990,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
 
   // "less than" operator for pair<Label, Element>.  Used in ProcessTransitions.
-  // Lexicographical order, which only compares the state when ordering the
+  // Lexicographical order, which only compares the state when ordering the
   // "Element" member of the pair.
 
   class PairComparator {
@@ -1022,7 +1014,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
   // with the same ilabel.
   // Side effects on repository, and (via ProcessTransition) on Q_, hash_,
   // and output_arcs_.
-
+
   void ProcessTransitions(OutputStateId output_state) {
     const vector<Element> &minimal_subset = *(output_states_[output_state]);
     // it's possible that minimal_subset could be empty if there are
@@ -1046,7 +1038,7 @@ template<class Weight, class IntType> class LatticeDeterminizer {
         next_elem.weight = Times(elem.weight, arc.weight);
         if (arc.olabel == 0) // output epsilon
           next_elem.string = elem.string;
-        else
+        else
           next_elem.string = repository_.Successor(elem.string, arc.olabel);
         all_elems.push_back(this_pr);
       }
@@ -1083,29 +1075,28 @@ template<class Weight, class IntType> class LatticeDeterminizer {
     ProcessFinal(output_state);
     ProcessTransitions(output_state);
   }
-
+
   void Debug() { // this function called if you send a signal
     // SIGUSR1 to the process (and it's caught by the handler in
     // fstdeterminizestar). It prints out some traceback
     // info and exits.
-    std::cerr << "Debug function called (probably SIGUSR1 caught).\n";
+    KALDI_WARN << "Debug function called (probably SIGUSR1 caught)";
     // free up memory from the hash as we need a little memory
     { MinimalSubsetHash hash_tmp; hash_tmp.swap(minimal_hash_); }
     if (output_arcs_.size() <= 2) {
-      std::cerr << "Nothing to trace back";
-      exit(1);
+      KALDI_ERR << "Nothing to trace back";
     }
-    size_t max_state = output_arcs_.size() - 2; // don't take the last
+    size_t max_state = output_arcs_.size() - 2; // Don't take the last
     // one as we might be halfway into constructing it.
 
     vector<OutputStateId> predecessor(max_state+1, kNoStateId);
     for (size_t i = 0; i < max_state; i++) {
       for (size_t j = 0; j < output_arcs_[i].size(); j++) {
         OutputStateId nextstate = output_arcs_[i][j].nextstate;
-        // always find an earlier-numbered prececessor; this
+        // Always find an earlier-numbered predecessor; this
        // is always possible because of the way the algorithm
        // works.
        if (nextstate <= max_state && nextstate > i)
@@ -1113,8 +1104,8 @@ template<class Weight, class IntType> class LatticeDeterminizer {
       }
     }
     vector<pair<Label, StringId> > traceback;
-    // traceback is a pair of (ilabel, olabel-seq).
-    OutputStateId cur_state = max_state; // a recently constructed state.
+    // 'traceback' is a pair of (ilabel, olabel-seq).
+    OutputStateId cur_state = max_state; // A recently constructed state.
 
     while (cur_state != 0 && cur_state != kNoStateId) {
       OutputStateId last_state = predecessor[cur_state];
@@ -1128,23 +1119,25 @@ template<class Weight, class IntType> class LatticeDeterminizer {
           break;
         }
       }
-      assert(i != output_arcs_[last_state].size()); // or fell off loop.
+      KALDI_ASSERT(i != output_arcs_[last_state].size()); // Or fell off loop.
       cur_state = last_state;
     }
-    if (cur_state == kNoStateId)
-      std::cerr << "Traceback did not reach start state (possibly debug-code error)";
+    if (cur_state == kNoStateId)
+      KALDI_WARN << "Traceback did not reach start state "
+                 << "(possibly debug-code error)";
 
-    std::cerr << "Traceback below (or on standard error) in format ilabel (olabel olabel) ilabel (olabel) ...\n";
+    std::stringstream ss;
+    ss << "Traceback follows in format "
+       << "ilabel (olabel olabel) ilabel (olabel) ... :";
     for (ssize_t i = traceback.size() - 1; i >= 0; i--) {
-      std::cerr << traceback[i].first << ' ' << "( ";
+      ss << ' ' << traceback[i].first << " ( ";
       vector